npm - @dvai-bridge/ios - Versions diffs - 4.0.0 → 4.0.2 - Mend

@dvai-bridge/ios 4.0.0 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

package/Package.swift +104 -104
package/ios/Sources/DVAIBridge/BackendKind.swift +23 -23
package/ios/Sources/DVAIBridge/BoundServer.swift +46 -46
package/ios/Sources/DVAIBridge/DVAIBridge.swift +658 -658
package/ios/Sources/DVAIBridge/DVAIBridgeConfig.swift +86 -86
package/ios/Sources/DVAIBridge/DVAIBridgeError.swift +33 -33
package/ios/Sources/DVAIBridge/Internal/BackendSelector.swift +59 -59
package/ios/Sources/DVAIBridge/Internal/ProgressBroadcaster.swift +84 -84
package/ios/Sources/DVAIBridge/License/Audience.swift +133 -133
package/ios/Sources/DVAIBridge/License/Discovery.swift +164 -164
package/ios/Sources/DVAIBridge/License/LicenseValidator.swift +392 -392
package/ios/Sources/DVAIBridge/License/PublicKeys.swift +114 -114
package/ios/Sources/DVAIBridge/License/Types.swift +195 -195
package/ios/Sources/DVAIBridge/Offload/OffloadConfig.swift +118 -118
package/ios/Sources/DVAIBridge/ProgressEvent.swift +34 -34
package/ios/Sources/DVAICoreMLCore/CoreMLBackendError.swift +19 -19
package/ios/Sources/DVAICoreMLCore/CoreMLHandlers.swift +123 -123
package/ios/Sources/DVAICoreMLCore/CoreMLPluginState.swift +130 -130
package/ios/Sources/DVAICoreMLCore/Internal/CoreMLEngine.swift +137 -137
package/ios/Sources/DVAICoreMLCore/Internal/CoreMLGenerator.swift +108 -108
package/ios/Sources/DVAICoreMLCore/Internal/CoreMLSampler.swift +96 -96
package/ios/Sources/DVAICoreMLCore/Internal/CoreMLTokenizer.swift +69 -69
package/ios/Tests/DVAIBridgeTests/BackendSelectorTests.swift +53 -53
package/ios/Tests/DVAIBridgeTests/CoreMLEngineTests.swift +18 -18
package/ios/Tests/DVAIBridgeTests/CoreMLGeneratorShapeTests.swift +11 -11
package/ios/Tests/DVAIBridgeTests/CoreMLHandlersTests.swift +32 -32
package/ios/Tests/DVAIBridgeTests/CoreMLPluginStateTests.swift +41 -41
package/ios/Tests/DVAIBridgeTests/CoreMLSamplerTests.swift +40 -40
package/ios/Tests/DVAIBridgeTests/CoreMLTokenizerTests.swift +19 -19
package/ios/Tests/DVAIBridgeTests/DVAIBridgeAPIShapeTests.swift +37 -37
package/ios/Tests/DVAIBridgeTests/DVAIBridgeConfigTests.swift +52 -52
package/ios/Tests/DVAIBridgeTests/DVAIBridgeErrorTests.swift +33 -33
package/ios/Tests/DVAIBridgeTests/LicenseValidatorTests.swift +658 -658
package/ios/Tests/DVAIBridgeTests/ProgressBroadcasterTests.swift +69 -69
package/ios/Tests/DVAIBridgeTests/ProgressEventTests.swift +25 -25
package/ios/Tests/DVAIBridgeTests/ReactiveStateTests.swift +45 -45
package/ios/Tests/DVAIBridgeTests/RealModelIntegrationTest.swift +385 -359
package/package.json +3 -4
package/DVAIBridge.podspec +0 -120
package/LICENSE +0 -51
package/README.md +0 -199

package/ios/Sources/DVAICoreMLCore/CoreMLPluginState.swift CHANGED

@@ -1,130 +1,130 @@
-import Foundation
-import CoreML
-#if !COCOAPODS
-import DVAISharedCore   // HttpServer, DVAIHandlers, HandlerContext, CORSConfig
-#endif
-/// Public PluginState mirroring DVAILlamaCore.PluginState's shape.
-/// Boots a Telegraph HTTP server on `127.0.0.1:<port>` (with port-fallback),
-/// loads the .mlmodelc model + tokenizer, and serves OpenAI-compatible
-/// requests via CoreMLHandlers.
-///
-/// Requires iOS 18 / macOS 15 for MLState (KV-cache stateful decoding).
-@available(iOS 18.0, macOS 15.0, *)
-public actor CoreMLPluginState {
-    private var httpServer: HttpServer?
-    private var generator: CoreMLGenerator?
-    private var modelId: String = ""
-    private var isRunning: Bool = false
-    private var baseUrl: String?
-    private var port: Int?
-    public init() {}
-    public func start(opts: [String: Any]) async throws -> [String: Any] {
-        if isRunning { try await stop() }
-        guard let modelPath = opts["modelPath"] as? String, !modelPath.isEmpty else {
-            throw CoreMLBackendError.modelLoadFailed(
-                reason: "modelPath is required for the CoreML backend")
-        }
-        guard let tokenizerPath = opts["tokenizerPath"] as? String, !tokenizerPath.isEmpty else {
-            throw CoreMLBackendError.tokenizerLoadFailed(
-                reason: "tokenizerPath is required (path to a directory containing " +
-                        "tokenizer.json + tokenizer_config.json)")
-        }
-        let modelURL = URL(fileURLWithPath: modelPath)
-        let tokenizerDir = URL(fileURLWithPath: tokenizerPath)
-        // Optional opts with defaults — match Apple's stateful Llama-3.2
-        // conversion conventions (snake_case, matching HF / PyTorch).
-        let inputName = (opts["coremlInputName"] as? String) ?? "input_ids"
-        let causalMaskName = (opts["coremlCausalMaskName"] as? String) ?? "causal_mask"
-        let outputName = (opts["coremlOutputName"] as? String) ?? "logits"
-        let maxContextTokens = (opts["contextSize"] as? Int) ?? 2048
-        let temperature = (opts["temperature"] as? Double).map(Float.init) ?? 0.0
-        let topP = (opts["topP"] as? Double).map(Float.init) ?? 1.0
-        let topK = (opts["topK"] as? Int) ?? 0
-        let maxNewTokens = (opts["maxNewTokens"] as? Int) ?? 512
-        let httpBasePort = (opts["httpBasePort"] as? Int) ?? 38883
-        let httpMaxPortAttempts = (opts["httpMaxPortAttempts"] as? Int) ?? 16
-        // Load tokenizer first — its eosTokenId is needed by the engine.
-        let tokenizer = try await CoreMLTokenizer(tokenizerDir: tokenizerDir)
-        let engine = try CoreMLEngine(
-            modelURL: modelURL,
-            inputName: inputName,
-            causalMaskName: causalMaskName,
-            outputName: outputName,
-            maxContextTokens: maxContextTokens,
-            eosTokenId: tokenizer.eosTokenId
-        )
-        let sampler = CoreMLSampler(temperature: temperature, topP: topP, topK: topK)
-        let gen = CoreMLGenerator(
-            engine: engine,
-            tokenizer: tokenizer,
-            sampler: sampler,
-            maxNewTokens: maxNewTokens
-        )
-        let modelIdValue = modelURL.deletingPathExtension().lastPathComponent
-        let handlers = CoreMLHandlers(generator: gen, modelId: modelIdValue)
-        // Build context + cors first, install routes, THEN bind —
-        // Hummingbird requires routes at Application construction time
-        // so the install → bind order is mandatory.
-        let ctx = HandlerContext(modelId: modelIdValue, backendName: "coreml")
-        // Note: plan used DispatchConfig which doesn't exist in DVAILlamaCore.
-        // Real type is CORSConfig (public). parseCors() below maps opts → CORSConfig.
-        let corsConfig = parseCors(opts["corsOrigin"])
-        let server = HttpServer()
-        await server.installRoutes(handlers: handlers, ctx: ctx, corsConfig: corsConfig)
-        let boundPort = try await server.tryBind(
-            basePort: httpBasePort,
-            maxAttempts: httpMaxPortAttempts,
-            host: "127.0.0.1"
-        )
-        self.httpServer = server
-        self.generator = gen
-        self.modelId = modelIdValue
-        self.port = boundPort
-        self.baseUrl = "http://127.0.0.1:\(boundPort)/v1"
-        self.isRunning = true
-        return [
-            "baseUrl": self.baseUrl!,
-            "port": boundPort,
-            "backend": "coreml",
-            "modelId": modelIdValue,
-        ]
-    }
-    public func stop() async throws {
-        await httpServer?.stop()
-        httpServer = nil
-        generator = nil
-        modelId = ""
-        baseUrl = nil
-        port = nil
-        isRunning = false
-    }
-    public func statusInfo() -> [String: Any] {
-        var dict: [String: Any] = ["running": isRunning]
-        if let baseUrl = baseUrl { dict["baseUrl"] = baseUrl }
-        if isRunning { dict["backend"] = "coreml" }
-        return dict
-    }
-    // MARK: - Private
-    private func parseCors(_ raw: Any?) -> CORSConfig {
-        if let s = raw as? String { return s == "*" ? .wildcard : .exact(s) }
-        if let arr = raw as? [String] { return .allowlist(arr) }
-        return .wildcard
-    }
-}
+import Foundation
+import CoreML
+#if !COCOAPODS
+import DVAISharedCore   // HttpServer, DVAIHandlers, HandlerContext, CORSConfig
+#endif
+/// Public PluginState mirroring DVAILlamaCore.PluginState's shape.
+/// Boots a Telegraph HTTP server on `127.0.0.1:<port>` (with port-fallback),
+/// loads the .mlmodelc model + tokenizer, and serves OpenAI-compatible
+/// requests via CoreMLHandlers.
+///
+/// Requires iOS 18 / macOS 15 for MLState (KV-cache stateful decoding).
+@available(iOS 18.0, macOS 15.0, *)
+public actor CoreMLPluginState {
+    private var httpServer: HttpServer?
+    private var generator: CoreMLGenerator?
+    private var modelId: String = ""
+    private var isRunning: Bool = false
+    private var baseUrl: String?
+    private var port: Int?
+    public init() {}
+    public func start(opts: [String: Any]) async throws -> [String: Any] {
+        if isRunning { try await stop() }
+        guard let modelPath = opts["modelPath"] as? String, !modelPath.isEmpty else {
+            throw CoreMLBackendError.modelLoadFailed(
+                reason: "modelPath is required for the CoreML backend")
+        }
+        guard let tokenizerPath = opts["tokenizerPath"] as? String, !tokenizerPath.isEmpty else {
+            throw CoreMLBackendError.tokenizerLoadFailed(
+                reason: "tokenizerPath is required (path to a directory containing " +
+                        "tokenizer.json + tokenizer_config.json)")
+        }
+        let modelURL = URL(fileURLWithPath: modelPath)
+        let tokenizerDir = URL(fileURLWithPath: tokenizerPath)
+        // Optional opts with defaults — match Apple's stateful Llama-3.2
+        // conversion conventions (snake_case, matching HF / PyTorch).
+        let inputName = (opts["coremlInputName"] as? String) ?? "input_ids"
+        let causalMaskName = (opts["coremlCausalMaskName"] as? String) ?? "causal_mask"
+        let outputName = (opts["coremlOutputName"] as? String) ?? "logits"
+        let maxContextTokens = (opts["contextSize"] as? Int) ?? 2048
+        let temperature = (opts["temperature"] as? Double).map(Float.init) ?? 0.0
+        let topP = (opts["topP"] as? Double).map(Float.init) ?? 1.0
+        let topK = (opts["topK"] as? Int) ?? 0
+        let maxNewTokens = (opts["maxNewTokens"] as? Int) ?? 512
+        let httpBasePort = (opts["httpBasePort"] as? Int) ?? 38883
+        let httpMaxPortAttempts = (opts["httpMaxPortAttempts"] as? Int) ?? 16
+        // Load tokenizer first — its eosTokenId is needed by the engine.
+        let tokenizer = try await CoreMLTokenizer(tokenizerDir: tokenizerDir)
+        let engine = try CoreMLEngine(
+            modelURL: modelURL,
+            inputName: inputName,
+            causalMaskName: causalMaskName,
+            outputName: outputName,
+            maxContextTokens: maxContextTokens,
+            eosTokenId: tokenizer.eosTokenId
+        )
+        let sampler = CoreMLSampler(temperature: temperature, topP: topP, topK: topK)
+        let gen = CoreMLGenerator(
+            engine: engine,
+            tokenizer: tokenizer,
+            sampler: sampler,
+            maxNewTokens: maxNewTokens
+        )
+        let modelIdValue = modelURL.deletingPathExtension().lastPathComponent
+        let handlers = CoreMLHandlers(generator: gen, modelId: modelIdValue)
+        // Build context + cors first, install routes, THEN bind —
+        // Hummingbird requires routes at Application construction time
+        // so the install → bind order is mandatory.
+        let ctx = HandlerContext(modelId: modelIdValue, backendName: "coreml")
+        // Note: plan used DispatchConfig which doesn't exist in DVAILlamaCore.
+        // Real type is CORSConfig (public). parseCors() below maps opts → CORSConfig.
+        let corsConfig = parseCors(opts["corsOrigin"])
+        let server = HttpServer()
+        await server.installRoutes(handlers: handlers, ctx: ctx, corsConfig: corsConfig)
+        let boundPort = try await server.tryBind(
+            basePort: httpBasePort,
+            maxAttempts: httpMaxPortAttempts,
+            host: "127.0.0.1"
+        )
+        self.httpServer = server
+        self.generator = gen
+        self.modelId = modelIdValue
+        self.port = boundPort
+        self.baseUrl = "http://127.0.0.1:\(boundPort)/v1"
+        self.isRunning = true
+        return [
+            "baseUrl": self.baseUrl!,
+            "port": boundPort,
+            "backend": "coreml",
+            "modelId": modelIdValue,
+        ]
+    }
+    public func stop() async throws {
+        await httpServer?.stop()
+        httpServer = nil
+        generator = nil
+        modelId = ""
+        baseUrl = nil
+        port = nil
+        isRunning = false
+    }
+    public func statusInfo() -> [String: Any] {
+        var dict: [String: Any] = ["running": isRunning]
+        if let baseUrl = baseUrl { dict["baseUrl"] = baseUrl }
+        if isRunning { dict["backend"] = "coreml" }
+        return dict
+    }
+    // MARK: - Private
+    private func parseCors(_ raw: Any?) -> CORSConfig {
+        if let s = raw as? String { return s == "*" ? .wildcard : .exact(s) }
+        if let arr = raw as? [String] { return .allowlist(arr) }
+        return .wildcard
+    }
+}

package/ios/Sources/DVAICoreMLCore/Internal/CoreMLEngine.swift CHANGED

@@ -1,137 +1,137 @@
-import Foundation
-import CoreML
-/// Wraps an `MLModel` plus the shape conventions our CoreML LLM checkpoints
-/// follow. `makeConversationState()` produces a fresh `MLState` for each
-/// conversation so token-by-token decoding can preserve KV-cache across calls.
-///
-/// iOS 18 / macOS 15 API notes:
-///   - `MLModel.makeState()` returns `MLState` (non-optional, throws is not in
-///     the signature — it can still crash at runtime on non-stateful models).
-///   - `MLModel.prediction(from:using:options:)` takes state via the `using:`
-///     label, NOT `state:`. Verified against Apple's CoreML docs.
-@available(iOS 18.0, macOS 15.0, *)
-internal final class CoreMLEngine: @unchecked Sendable {
-    let model: MLModel
-    /// Name of the token-id input feature. Apple-converted Llama-3.2 stateful
-    /// checkpoints use `input_ids` (snake_case, matching HF / PyTorch
-    /// convention). Override via `opts["coremlInputName"]` for non-standard
-    /// checkpoints.
-    let inputName: String
-    /// Name of the causal-mask input feature. Apple-converted stateful
-    /// checkpoints declare a `causal_mask` Float16 multiarray of shape
-    /// `[1, 1, q_len, kv_len]` — the model uses it inside
-    /// `Ios18.scaledDotProductAttention`. Empty string disables the
-    /// causal-mask input (for older or simpler checkpoints that don't
-    /// declare it). Override via `opts["coremlCausalMaskName"]`.
-    let causalMaskName: String
-    let outputName: String      // default: "logits"
-    let maxContextTokens: Int   // from opts; default 2048
-    let eosTokenId: Int         // from tokenizer or opts
-    init(
-        modelURL: URL,
-        inputName: String = "input_ids",
-        causalMaskName: String = "causal_mask",
-        outputName: String = "logits",
-        maxContextTokens: Int = 2048,
-        eosTokenId: Int,
-        computeUnits: MLComputeUnits = .all
-    ) throws {
-        let cfg = MLModelConfiguration()
-        cfg.computeUnits = computeUnits
-        do {
-            self.model = try MLModel(contentsOf: modelURL, configuration: cfg)
-        } catch {
-            throw CoreMLBackendError.modelLoadFailed(reason: "\(error)")
-        }
-        self.inputName = inputName
-        self.causalMaskName = causalMaskName
-        self.outputName = outputName
-        self.maxContextTokens = maxContextTokens
-        self.eosTokenId = eosTokenId
-    }
-    /// Make a fresh KV-cache state for a new conversation.
-    /// Wraps `MLModel.makeState()` (iOS 18 / macOS 15).
-    /// Note: `makeState()` is NOT throwing in Apple's API; it returns `MLState`
-    /// directly. Non-stateful models will produce a state object that has no
-    /// effect — they won't crash here, but predictions will behave as if
-    /// stateless. Real validation happens at prediction time.
-    func makeConversationState() -> MLState {
-        // matches MLModel.makeState() iOS 18 non-throwing signature
-        return model.makeState()
-    }
-    /// Run a single-token forward pass using the given KV-cache state.
-    ///
-    /// Uses `MLModel.prediction(from:using:options:)` — the `using:` label
-    /// carries the `MLState` object (not `state:`). Verified against Apple docs.
-    ///
-    /// - Parameters:
-    ///   - token: New token id to feed (the K/V is appended to `state` by
-    ///     the model's `Ios18.writeState` op as a side-effect).
-    ///   - kvCachePosition: 0-based position of the new token in the
-    ///     conversation. The first prompt token is position 0, second is 1,
-    ///     etc. Used to size the causal-mask input. Caller increments this
-    ///     across runStep calls within the same conversation.
-    ///   - state: KV-cache `MLState` from `makeConversationState()`.
-    func runStep(token: Int, kvCachePosition: Int, state: MLState) throws -> MLMultiArray {
-        var features: [String: MLFeatureValue] = [:]
-        // input_ids: [1, 1] Int32 with the new token. Direct memory write
-        // (rather than NSNumber subscript) matches Apple's documented
-        // pattern for primitive multiarray data and avoids unnecessary
-        // bridging overhead.
-        //
-        // KNOWN ISSUE: on the reference Apple-converted Llama-3.2 stateful
-        // 4-bit checkpoint, the FIRST `model.prediction(from:using:)` call
-        // crashes hard inside CoreML's C++ IR layer with:
-        //
-        //   Error: Cannot retrieve vector from IRValue format int32
-        //
-        // The crash is reproducible on BOTH iOS Simulator and macOS-native,
-        // which rules out the previously-suspected simulator-only Espresso
-        // limitation. Verified that:
-        //   - Model loads fine (no "Failed to build execution plan").
-        //   - input_ids name + shape match the model description.
-        //   - causal_mask name + shape match Apple's published convention.
-        //
-        // The error manifests as a process crash (xctest exits unexpectedly,
-        // not a Swift Error throw), so callers can't try/catch it. The
-        // RealModelIntegrationTest gates the test off until the cause is
-        // understood. Live debugging on a real iOS device with Instruments
-        // is the next step. See:
-        //   packages/dvai-bridge-ios/ios/Tests/DVAIBridgeTests/RealModelIntegrationTest.swift
-        let inputArr = try MLMultiArray(shape: [1, 1], dataType: .int32)
-        inputArr.dataPointer.bindMemory(to: Int32.self, capacity: 1).pointee = Int32(token)
-        features[inputName] = MLFeatureValue(multiArray: inputArr)
-        // causal_mask: [1, 1, 1, kvCachePosition+1] Float16, all zeros.
-        //
-        // For autoregressive single-token decoding the new query attends to
-        // every K/V position seen so far (0..kvCachePosition inclusive), so
-        // the mask is all-zeros (zero = unmasked, large-negative = masked).
-        // Apple's stateful Llama-3.2 checkpoints declare this input as
-        // Float16 with shape flexibility `[1, 1, 1...2048, 1...2048]`; we
-        // produce the minimal slice for the current step.
-        if !causalMaskName.isEmpty,
-           model.modelDescription.inputDescriptionsByName[causalMaskName] != nil
-        {
-            let kvLen = max(1, kvCachePosition + 1)
-            let mask = try MLMultiArray(shape: [1, 1, 1, NSNumber(value: kvLen)], dataType: .float16)
-            // Float16 zero == bit pattern 0x0000, so memset(0) suffices.
-            memset(mask.dataPointer, 0, mask.count * MemoryLayout<UInt16>.size)
-            features[causalMaskName] = MLFeatureValue(multiArray: mask)
-        }
-        let input = try MLDictionaryFeatureProvider(dictionary: features)
-        // `prediction(from:using:options:)` is synchronous in Apple's CoreML iOS 18 API.
-        // Wrapped in CoreMLGenerator via async Task to avoid blocking the caller's thread.
-        let output = try model.prediction(from: input, using: state, options: MLPredictionOptions())
-        guard let logits = output.featureValue(for: outputName)?.multiArrayValue else {
-            throw CoreMLBackendError.generationFailed(reason: "no '\(outputName)' output in model prediction")
-        }
-        return logits
-    }
-}
+import Foundation
+import CoreML
+/// Wraps an `MLModel` plus the shape conventions our CoreML LLM checkpoints
+/// follow. `makeConversationState()` produces a fresh `MLState` for each
+/// conversation so token-by-token decoding can preserve KV-cache across calls.
+///
+/// iOS 18 / macOS 15 API notes:
+///   - `MLModel.makeState()` returns `MLState` (non-optional, throws is not in
+///     the signature — it can still crash at runtime on non-stateful models).
+///   - `MLModel.prediction(from:using:options:)` takes state via the `using:`
+///     label, NOT `state:`. Verified against Apple's CoreML docs.
+@available(iOS 18.0, macOS 15.0, *)
+internal final class CoreMLEngine: @unchecked Sendable {
+    let model: MLModel
+    /// Name of the token-id input feature. Apple-converted Llama-3.2 stateful
+    /// checkpoints use `input_ids` (snake_case, matching HF / PyTorch
+    /// convention). Override via `opts["coremlInputName"]` for non-standard
+    /// checkpoints.
+    let inputName: String
+    /// Name of the causal-mask input feature. Apple-converted stateful
+    /// checkpoints declare a `causal_mask` Float16 multiarray of shape
+    /// `[1, 1, q_len, kv_len]` — the model uses it inside
+    /// `Ios18.scaledDotProductAttention`. Empty string disables the
+    /// causal-mask input (for older or simpler checkpoints that don't
+    /// declare it). Override via `opts["coremlCausalMaskName"]`.
+    let causalMaskName: String
+    let outputName: String      // default: "logits"
+    let maxContextTokens: Int   // from opts; default 2048
+    let eosTokenId: Int         // from tokenizer or opts
+    init(
+        modelURL: URL,
+        inputName: String = "input_ids",
+        causalMaskName: String = "causal_mask",
+        outputName: String = "logits",
+        maxContextTokens: Int = 2048,
+        eosTokenId: Int,
+        computeUnits: MLComputeUnits = .all
+    ) throws {
+        let cfg = MLModelConfiguration()
+        cfg.computeUnits = computeUnits
+        do {
+            self.model = try MLModel(contentsOf: modelURL, configuration: cfg)
+        } catch {
+            throw CoreMLBackendError.modelLoadFailed(reason: "\(error)")
+        }
+        self.inputName = inputName
+        self.causalMaskName = causalMaskName
+        self.outputName = outputName
+        self.maxContextTokens = maxContextTokens
+        self.eosTokenId = eosTokenId
+    }
+    /// Make a fresh KV-cache state for a new conversation.
+    /// Wraps `MLModel.makeState()` (iOS 18 / macOS 15).
+    /// Note: `makeState()` is NOT throwing in Apple's API; it returns `MLState`
+    /// directly. Non-stateful models will produce a state object that has no
+    /// effect — they won't crash here, but predictions will behave as if
+    /// stateless. Real validation happens at prediction time.
+    func makeConversationState() -> MLState {
+        // matches MLModel.makeState() iOS 18 non-throwing signature
+        return model.makeState()
+    }
+    /// Run a single-token forward pass using the given KV-cache state.
+    ///
+    /// Uses `MLModel.prediction(from:using:options:)` — the `using:` label
+    /// carries the `MLState` object (not `state:`). Verified against Apple docs.
+    ///
+    /// - Parameters:
+    ///   - token: New token id to feed (the K/V is appended to `state` by
+    ///     the model's `Ios18.writeState` op as a side-effect).
+    ///   - kvCachePosition: 0-based position of the new token in the
+    ///     conversation. The first prompt token is position 0, second is 1,
+    ///     etc. Used to size the causal-mask input. Caller increments this
+    ///     across runStep calls within the same conversation.
+    ///   - state: KV-cache `MLState` from `makeConversationState()`.
+    func runStep(token: Int, kvCachePosition: Int, state: MLState) throws -> MLMultiArray {
+        var features: [String: MLFeatureValue] = [:]
+        // input_ids: [1, 1] Int32 with the new token. Direct memory write
+        // (rather than NSNumber subscript) matches Apple's documented
+        // pattern for primitive multiarray data and avoids unnecessary
+        // bridging overhead.
+        //
+        // KNOWN ISSUE: on the reference Apple-converted Llama-3.2 stateful
+        // 4-bit checkpoint, the FIRST `model.prediction(from:using:)` call
+        // crashes hard inside CoreML's C++ IR layer with:
+        //
+        //   Error: Cannot retrieve vector from IRValue format int32
+        //
+        // The crash is reproducible on BOTH iOS Simulator and macOS-native,
+        // which rules out the previously-suspected simulator-only Espresso
+        // limitation. Verified that:
+        //   - Model loads fine (no "Failed to build execution plan").
+        //   - input_ids name + shape match the model description.
+        //   - causal_mask name + shape match Apple's published convention.
+        //
+        // The error manifests as a process crash (xctest exits unexpectedly,
+        // not a Swift Error throw), so callers can't try/catch it. The
+        // RealModelIntegrationTest gates the test off until the cause is
+        // understood. Live debugging on a real iOS device with Instruments
+        // is the next step. See:
+        //   packages/dvai-bridge-ios/ios/Tests/DVAIBridgeTests/RealModelIntegrationTest.swift
+        let inputArr = try MLMultiArray(shape: [1, 1], dataType: .int32)
+        inputArr.dataPointer.bindMemory(to: Int32.self, capacity: 1).pointee = Int32(token)
+        features[inputName] = MLFeatureValue(multiArray: inputArr)
+        // causal_mask: [1, 1, 1, kvCachePosition+1] Float16, all zeros.
+        //
+        // For autoregressive single-token decoding the new query attends to
+        // every K/V position seen so far (0..kvCachePosition inclusive), so
+        // the mask is all-zeros (zero = unmasked, large-negative = masked).
+        // Apple's stateful Llama-3.2 checkpoints declare this input as
+        // Float16 with shape flexibility `[1, 1, 1...2048, 1...2048]`; we
+        // produce the minimal slice for the current step.
+        if !causalMaskName.isEmpty,
+           model.modelDescription.inputDescriptionsByName[causalMaskName] != nil
+        {
+            let kvLen = max(1, kvCachePosition + 1)
+            let mask = try MLMultiArray(shape: [1, 1, 1, NSNumber(value: kvLen)], dataType: .float16)
+            // Float16 zero == bit pattern 0x0000, so memset(0) suffices.
+            memset(mask.dataPointer, 0, mask.count * MemoryLayout<UInt16>.size)
+            features[causalMaskName] = MLFeatureValue(multiArray: mask)
+        }
+        let input = try MLDictionaryFeatureProvider(dictionary: features)
+        // `prediction(from:using:options:)` is synchronous in Apple's CoreML iOS 18 API.
+        // Wrapped in CoreMLGenerator via async Task to avoid blocking the caller's thread.
+        let output = try model.prediction(from: input, using: state, options: MLPredictionOptions())
+        guard let logits = output.featureValue(for: outputName)?.multiArrayValue else {
+            throw CoreMLBackendError.generationFailed(reason: "no '\(outputName)' output in model prediction")
+        }
+        return logits
+    }
+}