npm - @dvai-bridge/android-litert-core - Versions diffs - 4.0.0 - Mend

@dvai-bridge/android-litert-core 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/android/src/main/java/co/deepvoiceai/bridge/litert/core/Internal/HFTokenizerJson.kt ADDED Viewed

@@ -0,0 +1,380 @@
+package co.deepvoiceai.bridge.litert.core.Internal
+import co.deepvoiceai.bridge.litert.core.LiteRTBackendError
+import kotlinx.serialization.json.Json
+import kotlinx.serialization.json.JsonArray
+import kotlinx.serialization.json.JsonObject
+import kotlinx.serialization.json.JsonPrimitive
+import kotlinx.serialization.json.booleanOrNull
+import kotlinx.serialization.json.contentOrNull
+import kotlinx.serialization.json.intOrNull
+import kotlinx.serialization.json.jsonArray
+import kotlinx.serialization.json.jsonObject
+import kotlinx.serialization.json.jsonPrimitive
+import java.io.File
+/**
+ * Pure-Kotlin BPE tokenizer that loads HuggingFace's standard
+ * `tokenizer.json` schema. No JNI, no native library — works on every
+ * Android ABI without surprise UnsatisfiedLinkErrors.
+ *
+ * Why a custom parser instead of an off-the-shelf artifact?
+ *  - `com.github.huggingface:tokenizers-android` (JitPack) does not exist —
+ *    the URL 401s. The plan's original guess was wrong.
+ *  - `ai.djl.huggingface:tokenizers:0.36.0` (Maven Central) is JVM-only:
+ *    DJL ships `libtokenizers.so` for x86_64 + aarch64-linux-gnu + macOS +
+ *    Windows but NOT for Android (`*-linux-android`). Pulling DJL would
+ *    crash the first encode() with UnsatisfiedLinkError on every Android
+ *    target ABI.
+ *  - HF's official Rust crate has no Android JNI wrapper on Maven.
+ *
+ * What's supported:
+ *  - BPE merges (byte-pair encoding) — the standard Llama-3
+ *    tokenizer.json shape: `model.type == "BPE"`, `model.vocab` as
+ *    {token: id}, `model.merges` as space-separated `"A B"` pairs OR
+ *    array-of-pair tuples (HF v0.21+ format).
+ *  - Special / added tokens via `added_tokens` array (each entry is
+ *    `{ id, content, special }`).
+ *  - GPT-2 byte-level encoding pre-tokenizer (the standard Llama-3 case):
+ *    every input byte mapped through GPT-2's printable byte permutation
+ *    so the BPE step never has to handle unicode-class boundaries.
+ *  - decode() reverses the byte-level mapping and concatenates pieces.
+ *
+ * What's NOT supported (call sites must avoid these models):
+ *  - SentencePiece / Unigram tokenizers (`model.type == "Unigram"`) — Gemma
+ *    uses these; for Gemma checkpoints the consumer should use the
+ *    mediapipe backend instead which uses LiteRT-LM's bundled SentencePiece.
+ *  - Jinja chat templates from `tokenizer_config.json`. The handler layer
+ *    formats messages with a hard-coded Llama-3-style template
+ *    (`<|begin_of_text|><|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>`)
+ *    which works for Llama-3 family checkpoints; non-Llama checkpoints
+ *    require the consumer to pre-render the prompt themselves.
+ *  - Pre-tokenizer types other than ByteLevel (Whitespace, Sequence, etc.).
+ *
+ * If the loader encounters an unsupported tokenizer.json shape it throws
+ * [LiteRTBackendError.TokenizerLoadFailed] with a precise reason so the
+ * caller can fall back to a different backend.
+ *
+ * Mirrors the role of `CoreMLTokenizer.swift` (iOS) without the
+ * swift-transformers dependency — there's no equivalent maintained library
+ * on Android.
+ */
+internal class HFTokenizerJson private constructor(
+    private val vocab: Map<String, Int>,
+    private val idToToken: Map<Int, String>,
+    private val mergeRanks: Map<Pair<String, String>, Int>,
+    private val specialTokens: Set<String>,
+    private val byteToUnicode: Map<Int, Char>,
+    private val unicodeToByte: Map<Char, Int>,
+    val bosTokenId: Int?,
+    val eosTokenId: Int,
+    val padTokenId: Int?,
+) {
+    /**
+     * Tokenize [text] into vocabulary ids.
+     *
+     * The input is scanned left to right for special tokens. Each special
+     * token found is emitted as its single vocab id, and the plain text
+     * between special tokens (and after the last one) is passed through
+     * [encodeBpe]. An empty [text] yields an empty list.
+     */
+    fun encode(text: String): List<Int> {
+        if (text.isEmpty()) return emptyList()
+        val ids = mutableListOf<Int>()
+        var pos = 0
+        while (pos < text.length) {
+            val hit = findNextSpecial(text, pos)
+            // The plain run ends at the next special token, or at the end
+            // of the input when no further special token exists.
+            val plainEnd = hit?.start ?: text.length
+            if (plainEnd > pos) {
+                ids.addAll(encodeBpe(text.substring(pos, plainEnd)))
+            }
+            if (hit == null) break
+            // Special tokens map to exactly one id; resume right after them.
+            ids.add(vocab.getValue(hit.token))
+            pos = plainEnd + hit.token.length
+        }
+        return ids
+    }
+    /** A special token occurrence: the matched [token] text and its [start] index in the input. */
+    private data class SpecialMatch(val token: String, val start: Int)
+    /**
+     * Find the earliest special token occurring at or after [from] in [text].
+     *
+     * Candidates that cannot be emitted safely are ignored:
+     *  - an empty token would match at [from] with length 0, so the caller's
+     *    cursor would never advance;
+     *  - a token without a vocab entry cannot be resolved to an id.
+     *
+     * When several special tokens start at the same index, the longest one
+     * wins, so the result does not depend on the set's iteration order.
+     *
+     * @return the best match, or null when no usable special token occurs.
+     */
+    private fun findNextSpecial(text: String, from: Int): SpecialMatch? {
+        var best: SpecialMatch? = null
+        for (special in specialTokens) {
+            if (special.isEmpty() || special !in vocab) continue
+            val idx = text.indexOf(special, from)
+            if (idx < 0) continue
+            val current = best
+            val wins = current == null ||
+                idx < current.start ||
+                (idx == current.start && special.length > current.token.length)
+            if (wins) best = SpecialMatch(special, idx)
+        }
+        return best
+    }
+    /**
+     * Byte-level encode [text] and run BPE over the result.
+     *
+     * Each UTF-8 byte of [text] is replaced by its character from
+     * [byteToUnicode]; the whole mapped string is then handed to [bpe] as
+     * one "word".
+     *
+     * NOTE(review): no regex / whitespace pre-split happens here. If the
+     * tokenizer.json declares a splitting pre-tokenizer, reference
+     * implementations may produce different ids — confirm against the
+     * target checkpoint.
+     */
+    private fun encodeBpe(text: String): List<Int> {
+        val mapped = buildString {
+            for (raw in text.toByteArray(Charsets.UTF_8)) {
+                // Kotlin bytes are signed; mask to get the 0..255 table key.
+                append(byteToUnicode.getValue(raw.toInt() and 0xFF))
+            }
+        }
+        return bpe(mapped)
+    }
+    /**
+     * Apply BPE merges greedily to a single byte-level-encoded "word" and
+     * return the vocab ids of the resulting symbols.
+     *
+     * Algorithm:
+     *  1. Split the word into one symbol per char.
+     *  2. Among adjacent symbol pairs, pick the one with the lowest rank in
+     *     [mergeRanks] (the leftmost wins on equal rank).
+     *  3. Merge every occurrence of that pair, scanning left to right.
+     *  4. Repeat until no adjacent pair has a merge rank.
+     *  5. Look up each symbol in [vocab], falling back to `<unk>`.
+     *
+     * A symbol that is missing from the vocab when there is no `<unk>` entry
+     * fails with an [IllegalStateException] naming the symbol. This applies
+     * to one-symbol words too.
+     */
+    private fun bpe(word: String): List<Int> {
+        if (word.isEmpty()) return emptyList()
+        var symbols: List<String> = word.map { it.toString() }
+        while (symbols.size >= 2) {
+            // Find the lowest-rank adjacent pair.
+            var bestRank = Int.MAX_VALUE
+            var bestIdx = -1
+            for (i in 0 until symbols.size - 1) {
+                val rank = mergeRanks[symbols[i] to symbols[i + 1]] ?: continue
+                if (rank < bestRank) {
+                    bestRank = rank
+                    bestIdx = i
+                }
+            }
+            if (bestIdx < 0) break
+            // Merge every occurrence of the best pair, left to right.
+            val left = symbols[bestIdx]
+            val right = symbols[bestIdx + 1]
+            val merged = left + right
+            val rebuilt = ArrayList<String>(symbols.size)
+            var r = 0
+            while (r < symbols.size) {
+                if (r < symbols.size - 1 && symbols[r] == left && symbols[r + 1] == right) {
+                    rebuilt.add(merged)
+                    r += 2
+                } else {
+                    rebuilt.add(symbols[r])
+                    r += 1
+                }
+            }
+            symbols = rebuilt
+        }
+        val unkId = vocab["<unk>"]
+        return symbols.map { sym ->
+            vocab[sym] ?: unkId ?: error("token '$sym' not in vocab and no <unk> fallback")
+        }
+    }
+    /**
+     * Decode a list of token ids back to a string.
+     *
+     * Ordinary tokens are mapped char-by-char through [unicodeToByte] and
+     * the collected bytes are decoded as UTF-8; chars with no entry in that
+     * table are dropped. Ids with no entry in [idToToken] are ignored.
+     *
+     * Special tokens are omitted when [skipSpecialTokens] is true (the
+     * default). Otherwise their text is emitted verbatim: it is literal
+     * text, not byte-level-encoded, so running it through the reverse byte
+     * map could drop or alter characters such as spaces.
+     *
+     * @param tokens ids to decode, in order.
+     * @param skipSpecialTokens whether special tokens are left out of the result.
+     * @return the decoded text; malformed UTF-8 byte runs decode to
+     *   replacement characters.
+     */
+    fun decode(tokens: List<Int>, skipSpecialTokens: Boolean = true): String {
+        val text = StringBuilder()
+        // Bytes of ordinary tokens accumulate here so that a multi-byte
+        // UTF-8 character split across tokens is decoded in one piece.
+        var pending = ByteArray(64)
+        var pendingLen = 0
+        for (id in tokens) {
+            val tok = idToToken[id] ?: continue
+            if (tok in specialTokens) {
+                if (!skipSpecialTokens) {
+                    // Flush the bytes gathered so far, then emit the literal text.
+                    text.append(String(pending, 0, pendingLen, Charsets.UTF_8))
+                    pendingLen = 0
+                    text.append(tok)
+                }
+                continue
+            }
+            for (ch in tok) {
+                val byteVal = unicodeToByte[ch] ?: continue
+                if (pendingLen == pending.size) pending = pending.copyOf(pendingLen * 2)
+                pending[pendingLen++] = byteVal.toByte()
+            }
+        }
+        text.append(String(pending, 0, pendingLen, Charsets.UTF_8))
+        return text.toString()
+    }
+    /**
+     * Decode a single token id, skipping special tokens.
+     *
+     * A token holding only part of a multi-byte UTF-8 character decodes to
+     * replacement characters when decoded on its own.
+     */
+    fun decode(token: Int): String = decode(listOf(token), skipSpecialTokens = true)
+    companion object {
+        private val parser = Json { ignoreUnknownKeys = true }
+        /**
+         * Load a tokenizer.json from disk.
+         *
+         * Reads `model.vocab`, `model.merges` and the top-level `added_tokens`
+         * array, then looks up BOS / EOS / PAD ids by well-known token names.
+         *
+         * An `added_tokens` entry with empty `content` is ignored, and an
+         * entry is only registered as a special token when its content
+         * resolves to a vocab id. Otherwise encode() could loop on a
+         * zero-length match or fail on the id lookup.
+         *
+         * @param tokenizerJsonPath path to the tokenizer.json file.
+         * @param eosTokenIdOverride when non-null, used as the EOS id instead
+         *   of the one discovered from the vocab.
+         * @return the loaded tokenizer.
+         * @throws LiteRTBackendError.TokenizerLoadFailed when the file is
+         *   missing, cannot be parsed, has a `model.type` other than BPE,
+         *   lacks `model.vocab` / `model.merges`, contains a malformed vocab
+         *   or merge entry, or has no EOS-like token and no override.
+         */
+        @Throws(LiteRTBackendError.TokenizerLoadFailed::class)
+        fun load(tokenizerJsonPath: String, eosTokenIdOverride: Int? = null): HFTokenizerJson {
+            val file = File(tokenizerJsonPath)
+            if (!file.isFile) {
+                throw LiteRTBackendError.TokenizerLoadFailed(
+                    "tokenizer.json not found at $tokenizerJsonPath",
+                )
+            }
+            val root = try {
+                parser.parseToJsonElement(file.readText()).jsonObject
+            } catch (t: Throwable) {
+                throw LiteRTBackendError.TokenizerLoadFailed("failed to parse tokenizer.json: ${t.message}")
+            }
+            val model = root["model"] as? JsonObject
+                ?: throw LiteRTBackendError.TokenizerLoadFailed("tokenizer.json: missing 'model' object")
+            // A missing model.type is tolerated; an explicit non-BPE type is rejected.
+            val type = (model["type"] as? JsonPrimitive)?.contentOrNull
+            if (type != null && type != "BPE") {
+                throw LiteRTBackendError.TokenizerLoadFailed(
+                    "tokenizer.json: model.type='$type' is not supported (only BPE). Use the mediapipe backend for SentencePiece/Unigram models.",
+                )
+            }
+            val vocabRaw = model["vocab"] as? JsonObject
+                ?: throw LiteRTBackendError.TokenizerLoadFailed("tokenizer.json: missing 'model.vocab'")
+            val vocab = HashMap<String, Int>(vocabRaw.size)
+            val idToToken = HashMap<Int, String>(vocabRaw.size)
+            for ((tok, idEl) in vocabRaw) {
+                val id = (idEl as? JsonPrimitive)?.intOrNull
+                    ?: throw LiteRTBackendError.TokenizerLoadFailed("tokenizer.json: vocab entry '$tok' is not an int")
+                vocab[tok] = id
+                idToToken[id] = tok
+            }
+            val mergesRaw = model["merges"] as? JsonArray
+                ?: throw LiteRTBackendError.TokenizerLoadFailed("tokenizer.json: missing 'model.merges'")
+            // A merge's rank is its index in the array: lower index merges first.
+            val mergeRanks = HashMap<Pair<String, String>, Int>(mergesRaw.size)
+            for ((rank, mEl) in mergesRaw.withIndex()) {
+                val pair = parseMergeEntry(mEl)
+                    ?: throw LiteRTBackendError.TokenizerLoadFailed(
+                        "tokenizer.json: merges[$rank] is not a 'A B' string or [A,B] pair",
+                    )
+                mergeRanks[pair] = rank
+            }
+            // Added tokens. Each entry shape: { id, content, special, ... }.
+            // Entries with an id are merged into the vocab (overriding any
+            // model.vocab entry with the same content or id). An entry is
+            // treated as special unless it carries an explicit
+            // `special: false`; special tokens are matched verbatim by
+            // encode() and skipped by decode() when skipSpecialTokens=true.
+            val specialTokens = mutableSetOf<String>()
+            for (entry in (root["added_tokens"] as? JsonArray).orEmpty()) {
+                val obj = entry as? JsonObject ?: continue
+                val content = (obj["content"] as? JsonPrimitive)?.contentOrNull ?: continue
+                // An empty token can never be matched meaningfully and would
+                // stall encode()'s cursor.
+                if (content.isEmpty()) continue
+                val id = (obj["id"] as? JsonPrimitive)?.intOrNull
+                if (id != null) {
+                    vocab[content] = id
+                    idToToken[id] = content
+                }
+                val isSpecial = (obj["special"] as? JsonPrimitive)?.booleanOrNull ?: true
+                // Only register specials that encode() can resolve to an id.
+                if (isSpecial && content in vocab) specialTokens.add(content)
+            }
+            // Discover BOS / EOS / PAD ids by well-known token names in the
+            // merged vocab. The caller can override EOS via opts.
+            val bosTokenId = vocab["<|begin_of_text|>"] ?: vocab["<s>"] ?: vocab["<bos>"]
+            val discoveredEos = vocab["<|eot_id|>"]
+                ?: vocab["<|end_of_text|>"]
+                ?: vocab["</s>"]
+                ?: vocab["<eos>"]
+            val eosTokenId = eosTokenIdOverride ?: discoveredEos
+                ?: throw LiteRTBackendError.TokenizerLoadFailed(
+                    "tokenizer.json: no EOS-like token in added_tokens (looked for <|eot_id|>, <|end_of_text|>, </s>, <eos>) — pass eosTokenId in start opts to override",
+                )
+            val padTokenId = vocab["<pad>"] ?: vocab["<|pad|>"]
+            val (b2u, u2b) = buildByteLevelMap()
+            return HFTokenizerJson(
+                vocab = vocab,
+                idToToken = idToToken,
+                mergeRanks = mergeRanks,
+                specialTokens = specialTokens,
+                byteToUnicode = b2u,
+                unicodeToByte = u2b,
+                bosTokenId = bosTokenId,
+                eosTokenId = eosTokenId,
+                padTokenId = padTokenId,
+            )
+        }
+        /**
+         * Convert one element of `model.merges` into a (left, right) pair.
+         *
+         * Accepted shapes:
+         *  - a string `"A B"`, split at its first space;
+         *  - a two-element array `["A", "B"]` of strings.
+         *
+         * @return the pair, or null when [el] matches neither shape.
+         */
+        private fun parseMergeEntry(el: kotlinx.serialization.json.JsonElement): Pair<String, String>? = when (el) {
+            is JsonPrimitive ->
+                el.contentOrNull
+                    ?.takeIf { ' ' in it }
+                    ?.let { it.substringBefore(' ') to it.substringAfter(' ') }
+            is JsonArray -> {
+                val parts = el.takeIf { it.size == 2 }
+                    ?.map { (it as? JsonPrimitive)?.contentOrNull }
+                val first = parts?.get(0)
+                val second = parts?.get(1)
+                if (first != null && second != null) first to second else null
+            }
+            else -> null
+        }
+        /**
+         * Build the reversible byte <-> char tables used for byte-level BPE.
+         *
+         * Each of the 256 byte values gets a distinct char:
+         *  - bytes 33..126, 161..172 and 174..255 map to the char with the
+         *    same code point;
+         *  - every other byte (0..32, 127..160, 173) is assigned, in
+         *    ascending byte order, the next code point starting at 256
+         *    (U+0100, the start of Latin Extended-A).
+         *
+         * NOTE(review): intended to match HuggingFace's ByteLevel
+         * `bytes_to_unicode()` table — verify against the reference if the
+         * ranges are ever edited.
+         *
+         * @return (byte -> char, char -> byte), each with 256 entries.
+         */
+        private fun buildByteLevelMap(): Pair<Map<Int, Char>, Map<Char, Int>> {
+            val selfMapped = (33..126) + (161..172) + (174..255)
+            val byteToChar = HashMap<Int, Char>(256)
+            val charToByte = HashMap<Char, Int>(256)
+            // Record one byte <-> code point association in both directions.
+            fun link(byte: Int, codePoint: Int) {
+                val ch = codePoint.toChar()
+                byteToChar[byte] = ch
+                charToByte[ch] = byte
+            }
+            for (b in selfMapped) link(b, b)
+            var shifted = 0
+            for (b in 0..255) {
+                if (b in selfMapped) continue
+                link(b, 256 + shifted)
+                shifted += 1
+            }
+            return byteToChar to charToByte
+        }
+    }
+}

package/android/src/main/java/co/deepvoiceai/bridge/litert/core/Internal/LiteRTEngine.kt ADDED Viewed

@@ -0,0 +1,241 @@
+package co.deepvoiceai.bridge.litert.core.Internal
+import co.deepvoiceai.bridge.litert.core.LiteRTBackendError
+import com.google.ai.edge.litert.Accelerator
+import com.google.ai.edge.litert.CompiledModel
+import com.google.ai.edge.litert.TensorBuffer
+import com.google.ai.edge.litert.TensorType
+import java.io.File
+/**
+ * Test seam over the LiteRT [CompiledModel]. Concrete [LiteRTEngine] runs
+ * the real native runtime; [LiteRTGenerator]'s mock test substitutes a
+ * canned-logits fake without loading a .tflite.
+ *
+ * The contract is one forward pass per [runStep] call plus an explicit
+ * [close] to release whatever the implementation holds.
+ */
+internal interface LiteRTEngineApi {
+    /** Vocab size (= length of the FloatArray returned by [runStep]). */
+    val vocabSize: Int
+    /**
+     * EOS id in the model's vocab — generator uses it to terminate decode.
+     */
+    val eosTokenId: Int
+    /**
+     * Run a single forward pass with [token] at position [kvCachePosition].
+     *
+     * @param token id of the token to feed to the model.
+     * @param kvCachePosition position of [token] in the sequence being decoded.
+     * @return the logits row for the next-token prediction (length = [vocabSize]).
+     * @throws LiteRTBackendError.GenerationFailed on native failure.
+     */
+    fun runStep(token: Int, kvCachePosition: Int): FloatArray
+    /** Release native resources. Idempotent. */
+    fun close()
+}
+/**
+ * Wraps Google's LiteRT [CompiledModel] for a stateful Llama-style
+ * autoregressive checkpoint. Drives single-token decoding via named-tensor
+ * `run(inputs, outputs, signature)` calls.
+ *
+ * Why not use LiteRT-LM? We deliberately depend on bare `litert` (see
+ * `android/build.gradle` top-of-file comment), so the KV-cache / sampler
+ * loop is implemented here in Kotlin. The Llama-style .tflite checkpoints
+ * we target carry the cache as graph-internal state, exposed through
+ * named inputs/outputs that the runtime maintains across calls within one
+ * [CompiledModel] instance — same shape Apple's CoreML stateful Llama
+ * checkpoints follow on iOS (see `CoreMLEngine.swift`).
+ *
+ * Tensor convention (auto-detected at init via [CompiledModel.getInputTensorType]):
+ *  - [inputName]    `input_ids`     INT32 [1, 1]                       (default, overridable)
+ *  - [causalMaskName] `causal_mask` FLOAT [1, 1, 1, kv_len]            (optional)
+ *  - [outputName]   `logits`        FLOAT [1, 1, vocab] or [1, vocab]  (auto)
+ *
+ * If the model declares no `causal_mask` input we silently skip writing
+ * it — many simpler stateful checkpoints don't expose one.
+ *
+ * This class is NOT thread-safe. [LiteRTHandlers] serializes all calls
+ * behind a mutex; do the same in any other call site.
+ */
+internal class LiteRTEngine(
+    modelPath: String,
+    private val inputName: String = "input_ids",
+    private val causalMaskName: String = "causal_mask",
+    private val outputName: String = "logits",
+    /** Surface override so the handler / config layer can lift it from start opts. */
+    @Suppress("UNUSED_PARAMETER")
+    private val contextSize: Int = 2048,
+    eosTokenId: Int,
+    accelerator: Accelerator = Accelerator.CPU,
+) : LiteRTEngineApi, AutoCloseable {
+    private val model: CompiledModel
+    override val vocabSize: Int
+    override val eosTokenId: Int = eosTokenId
+    private val hasCausalMask: Boolean
+    private val causalMaskRank: Int
+    private val inputIsInt64: Boolean
+    init { // Loads the model and probes tensor metadata; once the model exists it is closed before any failure is thrown.
+        val f = File(modelPath)
+        if (!f.isFile) {
+            throw LiteRTBackendError.ModelLoadFailed("model file not found at $modelPath")
+        }
+        model = try {
+            CompiledModel.create(modelPath, CompiledModel.Options(accelerator))
+        } catch (t: Throwable) {
+            throw LiteRTBackendError.ModelLoadFailed("CompiledModel.create failed: ${t.message ?: t::class.java.simpleName}")
+        }
+        // Validate that the input_ids tensor exists and capture its element
+        // type for the write path. We don't enforce shape == [1,1] here —
+        // the model owns its declared signature; runStep feeds the token as
+        // a 1-element array and lets the runtime accept or reject it.
+        val inputType = try {
+            model.getInputTensorType(inputName)
+        } catch (t: Throwable) {
+            // If the named-tensor lookup fails, the consumer's checkpoint
+            // doesn't follow our convention. Fail fast with a precise
+            // message — silent fallback to default signature would only
+            // surface the real shape mismatch deep inside nativeRun().
+            model.close()
+            throw LiteRTBackendError.ModelLoadFailed(
+                "input tensor '$inputName' not found on model (override via litertInputName opt). " +
+                    "Cause: ${t.message ?: t::class.java.simpleName}",
+            )
+        }
+        // Both INT32 and INT64 input_ids are seen on Llama-style
+        // checkpoints in the wild (Llama-3 typically int32, some Gemma
+        // exports int64). The element type is recorded here and runStep
+        // picks `writeInt` vs `writeLong` from it, so both variants pass
+        // LiteRT's strict dtype check. Any other element type is rejected.
+        if (inputType.elementType != TensorType.ElementType.INT &&
+            inputType.elementType != TensorType.ElementType.INT64
+        ) {
+            model.close()
+            throw LiteRTBackendError.ModelLoadFailed(
+                "input tensor '$inputName' has unsupported elementType=${inputType.elementType}; expected INT or INT64",
+            )
+        }
+        inputIsInt64 = inputType.elementType == TensorType.ElementType.INT64
+        // causal_mask is optional — many simpler checkpoints don't expose it.
+        // Probe via a try/catch on getInputTensorType since the LiteRT API
+        // doesn't have a non-throwing "does this input exist?" call.
+        var maskRank = 0 // stays 0 when the mask is absent or reports no layout
+        val maskPresent = try {
+            val maskType = model.getInputTensorType(causalMaskName)
+            maskRank = maskType.layout?.rank ?: 0
+            true
+        } catch (_: Throwable) { // any lookup failure is treated as "no causal mask"
+            false
+        }
+        hasCausalMask = maskPresent
+        causalMaskRank = maskRank
+        // Discover logits rank + vocab size by inspecting the output
+        // tensor type. Handles both [1, 1, V] (Llama-3 style) and
+        // [1, V] (some Gemma exports); the last dimension is the vocab.
+        val outputType = try {
+            model.getOutputTensorType(outputName)
+        } catch (t: Throwable) {
+            model.close()
+            throw LiteRTBackendError.ModelLoadFailed(
+                "output tensor '$outputName' not found on model (override via litertOutputName opt). " +
+                    "Cause: ${t.message ?: t::class.java.simpleName}",
+            )
+        }
+        val outDims = outputType.layout?.dimensions ?: emptyList() // a missing layout is reported as rank 0 below
+        vocabSize = when (outDims.size) {
+            3 -> outDims[2]   // [1, 1, V] — Llama-3 style
+            2 -> outDims[1]   // [1, V]    — some Gemma exports
+            else -> {
+                model.close()
+                throw LiteRTBackendError.ModelLoadFailed(
+                    "output tensor '$outputName' has unsupported rank=${outDims.size}; expected 2 or 3 with vocab as last dim",
+                )
+            }
+        }
+        if (vocabSize <= 0) { // e.g. a dynamic or unknown last dimension
+            model.close()
+            throw LiteRTBackendError.ModelLoadFailed(
+                "output tensor '$outputName' reports non-positive vocab size: $vocabSize",
+            )
+        }
+    }
+    /**
+     * Run one decode step: feed [token] to the model and return the logits
+     * for the next-token prediction.
+     *
+     * [kvCachePosition] is used only to size the optional causal-mask input
+     * and to annotate error messages; no other tensor receives it. Every
+     * tensor buffer created for the call is closed before returning,
+     * whether the call succeeds or fails.
+     *
+     * @param token id written to the `input_ids` tensor (as Int or Long,
+     *   depending on the element type detected at init).
+     * @param kvCachePosition position of [token] in the sequence.
+     * @return a [FloatArray] of length [vocabSize]; when the output buffer
+     *   is longer, its last [vocabSize] values are returned.
+     * @throws LiteRTBackendError.GenerationFailed when buffer creation,
+     *   buffer writes, the model run, or reading the logits fails, or when
+     *   the logits buffer is shorter than [vocabSize].
+     */
+    override fun runStep(token: Int, kvCachePosition: Int): FloatArray {
+        val opened = mutableListOf<TensorBuffer>()
+        try {
+            val inputs = mutableMapOf<String, TensorBuffer>()
+            val outputs = mutableMapOf<String, TensorBuffer>()
+            // input_ids: a single element holding the new token. writeInt vs
+            // writeLong follows the element type captured at init.
+            val inputBuf = model.createInputBuffer(inputName)
+            opened.add(inputBuf)
+            if (inputIsInt64) {
+                inputBuf.writeLong(longArrayOf(token.toLong()))
+            } else {
+                inputBuf.writeInt(intArrayOf(token))
+            }
+            inputs[inputName] = inputBuf
+            // causal_mask (only when the model declares one): all zeros,
+            // i.e. every position up to and including the current one is
+            // visible for this single-token step.
+            // NOTE(review): kvLen floats are written regardless of the
+            // tensor's declared size — confirm that LiteRT accepts this for
+            // fixed-size masks and accepts float writes for FP16 tensors.
+            if (hasCausalMask && causalMaskRank > 0) {
+                val kvLen = maxOf(1, kvCachePosition + 1)
+                val maskBuf = model.createInputBuffer(causalMaskName)
+                opened.add(maskBuf)
+                maskBuf.writeFloat(FloatArray(kvLen))
+                inputs[causalMaskName] = maskBuf
+            }
+            val outputBuf = model.createOutputBuffer(outputName)
+            opened.add(outputBuf)
+            outputs[outputName] = outputBuf
+            try {
+                model.run(inputs, outputs)
+            } catch (t: Throwable) {
+                throw LiteRTBackendError.GenerationFailed(
+                    "model.run failed at kvPos=$kvCachePosition token=$token: ${t.message ?: t::class.java.simpleName}",
+                )
+            }
+            val raw = outputBuf.readFloat()
+            // With a single logits row the buffer is returned as-is. If the
+            // buffer holds several rows, the last vocabSize values are the
+            // prediction for the next token.
+            if (raw.size == vocabSize) return raw
+            val start = raw.size - vocabSize
+            if (start < 0) {
+                throw LiteRTBackendError.GenerationFailed(
+                    "logits buffer length ${raw.size} is smaller than vocabSize $vocabSize",
+                )
+            }
+            return raw.copyOfRange(start, raw.size)
+        } catch (e: LiteRTBackendError.GenerationFailed) {
+            // Already the documented failure type; pass through untouched.
+            throw e
+        } catch (t: Throwable) {
+            // Buffer creation / write / read failures are native failures
+            // too; report them with the documented exception type instead
+            // of leaking the runtime's own exception.
+            throw LiteRTBackendError.GenerationFailed(
+                "runStep failed at kvPos=$kvCachePosition token=$token: ${t.message ?: t::class.java.simpleName}",
+            )
+        } finally {
+            // Release every per-call TensorBuffer; close failures are ignored.
+            for (buf in opened) {
+                runCatching { buf.close() }
+            }
+        }
+    }
+    /** Close the underlying [CompiledModel]; any failure while closing is swallowed. */
+    override fun close() {
+        try {
+            model.close()
+        } catch (_: Throwable) {
+            // Best-effort release: errors during close are intentionally ignored.
+        }
+    }
+}