npm - @dvai-bridge/ios - Versions diffs - 4.0.0 - Mend

@dvai-bridge/ios 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

package/ios/Sources/DVAIBridge/ReactiveState.swift ADDED Viewed

@@ -0,0 +1,149 @@
+import Foundation
+import Combine
+/// SwiftUI-friendly reactive state. Exposes lifecycle and progress as
+/// observable properties on the main actor.
+///
+/// ## Distribution-channel asymmetry
+///
+/// - **Under SwiftPM** (`Package.swift`): full `ObservableObject` +
+///   `@Published` API. Drop into a SwiftUI view as `@StateObject` /
+///   `@ObservedObject` and the view re-renders automatically when any
+///   property changes.
+/// - **Under CocoaPods** (`DVAIBridge.podspec`): `ObservableObject`
+///   conformance and the `@Published` wrappers are intentionally OMITTED.
+///   The properties remain `public private(set) var` and are still
+///   readable; observers must subscribe to `stateChanges` (the always-
+///   available `Combine` publisher below) instead of using SwiftUI's
+///   property-wrapper integration.
+///
+/// **Why the asymmetry?** Xcode 26 / iOS 26 SDK's static linker emits
+/// an implicit link directive for `SwiftUICore` (a private framework
+/// non-Apple products cannot link) for *any* module that conforms a type
+/// to `ObservableObject` — even if the module never imports SwiftUI.
+/// Linking `SwiftUICore` from a non-Apple framework fails with
+/// "cannot link directly with 'SwiftUICore' because product being built
+/// is not an allowed client of it". CocoaPods bundles all of dvai-bridge
+/// into a single Swift module, so the trigger lands on every consumer's
+/// link line. SwiftPM, by contrast, builds dvai-bridge as a library
+/// dynamically resolved at the consumer's link line where SwiftUICore
+/// access *is* allowed (because the consumer's app IS an allowed client),
+/// so the same conformance compiles fine.
+///
+/// CocoaPods SwiftUI consumers wanting reactive view updates should:
+///
+///     @State private var snapshot = DVAIBridgeSnapshot()
+///     ...
+///     .onReceive(DVAIBridge.shared.reactive.stateChanges) { _ in
+///         snapshot = DVAIBridgeSnapshot.from(DVAIBridge.shared.reactive)
+///     }
+///
+/// Or wrap the reactive object in a small SwiftUI-side adapter that
+/// conforms to `ObservableObject` themselves (since their app target
+/// IS an allowed SwiftUICore client).
+@MainActor
+public final class DVAIBridgeReactiveState {
+    // The setters are private, so the only writers are the `did…` hooks
+    // below. Each hook emits exactly one `stateChanges` event *after* it has
+    // finished mutating, so a subscriber that reads the properties from its
+    // sink never observes a half-applied transition (for example
+    // `isReady == true` while `baseUrl` is still nil).
+    #if COCOAPODS
+    public private(set) var isReady: Bool = false
+    public private(set) var baseUrl: String? = nil
+    public private(set) var port: Int? = nil
+    public private(set) var currentBackend: BackendKind? = nil
+    public private(set) var lastProgress: ProgressEvent? = nil
+    #else
+    @Published public private(set) var isReady: Bool = false
+    @Published public private(set) var baseUrl: String? = nil
+    @Published public private(set) var port: Int? = nil
+    @Published public private(set) var currentBackend: BackendKind? = nil
+    @Published public private(set) var lastProgress: ProgressEvent? = nil
+    #endif
+    private let stateChangesSubject = PassthroughSubject<Void, Never>()
+    /// Combine publisher that fires once per state transition (start, stop,
+    /// progress event), after every affected property has been updated.
+    /// Available in both SwiftPM and CocoaPods builds — SwiftPM consumers
+    /// usually use `ObservableObject` directly via SwiftUI's property
+    /// wrappers, but this publisher remains available as a non-SwiftUI
+    /// alternative.
+    public var stateChanges: AnyPublisher<Void, Never> {
+        stateChangesSubject.eraseToAnyPublisher()
+    }
+    internal init() {}
+    /// Records a successful server start and notifies observers.
+    /// - Parameter server: The bound server whose `baseUrl`, `port` and
+    ///   `backend` are copied into the observable properties.
+    internal func didStart(_ server: BoundServer) {
+        isReady = true
+        baseUrl = server.baseUrl
+        port = server.port
+        currentBackend = server.backend
+        // Under SwiftPM the @Published wrappers additionally drive
+        // `objectWillChange`; the subject is emitted in both build flavours
+        // so non-SwiftUI observers can subscribe uniformly.
+        stateChangesSubject.send()
+    }
+    /// Resets the lifecycle properties to their stopped values and notifies
+    /// observers. `lastProgress` is intentionally left untouched.
+    internal func didStop() {
+        isReady = false
+        baseUrl = nil
+        port = nil
+        currentBackend = nil
+        stateChangesSubject.send()
+    }
+    /// Stores the most recent progress event and notifies observers.
+    internal func didReceiveProgress(_ event: ProgressEvent) {
+        lastProgress = event
+        stateChangesSubject.send()
+    }
+}
+#if !COCOAPODS
+// `ObservableObject` conformance is declared only outside CocoaPods builds;
+// the type's header comment explains the SwiftUICore link failure that the
+// conformance triggers when the module is built as a CocoaPods framework.
+extension DVAIBridgeReactiveState: ObservableObject {}
+#endif
+extension DVAIBridge {
+    /// The reactive-state object associated with this bridge instance,
+    /// looked up through the main-actor registry. Repeated reads yield the
+    /// same object — under SwiftPM, hold it as `@StateObject` upstream;
+    /// under CocoaPods, subscribe to its `stateChanges` publisher.
+    @MainActor
+    public var reactive: DVAIBridgeReactiveState {
+        let registry = DVAIBridgeReactiveStateRegistry.shared
+        return registry.state(for: self)
+    }
+}
+/// Main-actor registry that hands out one `DVAIBridgeReactiveState` per
+/// `DVAIBridge` instance. Actors can't own MainActor-isolated state
+/// directly, so the mapping is kept here, keyed by
+/// `ObjectIdentifier(bridge)`.
+@MainActor
+internal final class DVAIBridgeReactiveStateRegistry {
+    static let shared = DVAIBridgeReactiveStateRegistry()
+    private var states: [ObjectIdentifier: DVAIBridgeReactiveState] = [:]
+    /// Returns the state object for `bridge`, creating and caching it on
+    /// first use. Creation also starts a main-actor task that forwards every
+    /// element of `bridge.progressStream` into the new state object.
+    func state(for bridge: DVAIBridge) -> DVAIBridgeReactiveState {
+        let key = ObjectIdentifier(bridge)
+        guard let cached = states[key] else {
+            let created = DVAIBridgeReactiveState()
+            states[key] = created
+            // The task holds the state weakly; the registry's dictionary is
+            // what keeps it alive.
+            Task { @MainActor [weak created] in
+                for await event in bridge.progressStream {
+                    created?.didReceiveProgress(event)
+                }
+            }
+            return created
+        }
+        return cached
+    }
+}

package/ios/Sources/DVAICoreMLCore/.gitkeep ADDED Viewed

File without changes

package/ios/Sources/DVAICoreMLCore/CoreMLBackendError.swift ADDED Viewed

@@ -0,0 +1,19 @@
+import Foundation
+/// Failures surfaced by the CoreML backend. Every case carries a free-form
+/// `reason` string that is appended to a fixed, case-specific prefix in
+/// `errorDescription`.
+public enum CoreMLBackendError: Error, LocalizedError, Sendable {
+    case modelLoadFailed(reason: String)
+    case tokenizerLoadFailed(reason: String)
+    case stateInitFailed(reason: String)
+    case generationFailed(reason: String)
+    case unsupportedModelFormat(reason: String)
+    /// Human-readable message: the case's prefix followed by its reason.
+    public var errorDescription: String? {
+        let message: String
+        switch self {
+        case .modelLoadFailed(let reason):
+            message = "CoreML model load failed: \(reason)"
+        case .tokenizerLoadFailed(let reason):
+            message = "Tokenizer load failed: \(reason)"
+        case .stateInitFailed(let reason):
+            message = "MLState init failed: \(reason)"
+        case .generationFailed(let reason):
+            message = "Generation failed: \(reason)"
+        case .unsupportedModelFormat(let reason):
+            message = "Unsupported model format: \(reason)"
+        }
+        return message
+    }
+}

package/ios/Sources/DVAICoreMLCore/CoreMLHandlers.swift ADDED Viewed

@@ -0,0 +1,123 @@
+import Foundation
+#if !COCOAPODS
+import DVAISharedCore
+#endif
+/// `DVAIHandlers` conformer for the CoreML backend.
+/// Translates OpenAI-compatible HTTP requests into CoreMLGenerator calls and
+/// formats the results as OpenAI JSON / SSE responses.
+@available(iOS 18.0, macOS 15.0, *)
+public final class CoreMLHandlers: DVAIHandlers {
+    private let generator: CoreMLGenerator
+    private let modelId: String
+    // Internal init — `CoreMLGenerator` is an implementation detail of
+    // DVAICoreMLCore and stays internal. The only construction site is
+    // `CoreMLPluginState.start()` inside the same module.
+    internal init(generator: CoreMLGenerator, modelId: String) {
+        self.generator = generator
+        self.modelId = modelId
+    }
+    /// Handles an OpenAI-style chat-completion request.
+    ///
+    /// - Parameters:
+    ///   - body: Decoded JSON request. Reads `messages` (required, array of
+    ///     string→string dictionaries), `stream`, `temperature`, `top_p` and
+    ///     `max_tokens`.
+    ///   - ctx: Handler context (not read by this implementation).
+    /// - Returns: `.sse` when `stream` is true, `.json(200, …)` for buffered
+    ///   generation, or `.error` with 400 (bad request / chat template
+    ///   failure) or 500 (generation failure).
+    public func handleChatCompletion(body: [String: Any], ctx: HandlerContext) async throws -> HandlerResponse {
+        guard let messages = body["messages"] as? [[String: String]] else {
+            return .error(400, "messages array is required")
+        }
+        let stream = (body["stream"] as? Bool) ?? false
+        // Fall back to the limit configured at start-up rather than a
+        // hard-coded constant, so the plugin's `maxNewTokens` option is
+        // honoured when the request does not specify `max_tokens`.
+        let maxTokens = (body["max_tokens"] as? Int) ?? max(0, generator.maxNewTokens)
+        // A negative limit would otherwise reach a `0 ..< n` range in the
+        // decode loop and trap the whole process.
+        guard maxTokens >= 0 else {
+            return .error(400, "max_tokens must be non-negative")
+        }
+        // Per-request sampling overrides. When the request carries no
+        // sampling parameters at all, reuse the sampler configured at
+        // start-up; otherwise build one from the request, using the
+        // historical defaults (temperature 0.0, top_p 1.0, top_k 0) for the
+        // fields that were not supplied.
+        // NOTE(review): a partial override cannot inherit the configured
+        // sampler's remaining fields because `CoreMLSampler`'s properties
+        // are not visible from here — confirm whether that is desired.
+        let requestedTemperature = (body["temperature"] as? Double).map(Float.init)
+        let requestedTopP = (body["top_p"] as? Double).map(Float.init)
+        let requestSampler: CoreMLSampler
+        if requestedTemperature == nil && requestedTopP == nil {
+            requestSampler = generator.sampler
+        } else {
+            requestSampler = CoreMLSampler(
+                temperature: requestedTemperature ?? 0.0,
+                topP: requestedTopP ?? 1.0,
+                topK: 0
+            )
+        }
+        let requestGenerator = CoreMLGenerator(
+            engine: generator.engine,
+            tokenizer: generator.tokenizer,
+            sampler: requestSampler,
+            maxNewTokens: maxTokens
+        )
+        let promptTokens: [Int]
+        do {
+            promptTokens = try generator.tokenizer.applyChatTemplate(messages: messages)
+        } catch {
+            return .error(400, "tokenizer chat-template failed: \(error.localizedDescription)")
+        }
+        if stream {
+            let sse = requestGenerator.generateStream(promptTokens: promptTokens)
+            let streamId = UUID().uuidString
+            // Escape once up front: a model id containing `"` or `\` would
+            // otherwise corrupt every emitted event.
+            let modelJSON = jsonString(modelId)
+            let mappedStream = AsyncStream<String> { cont in
+                let forwarder = Task {
+                    do {
+                        for try await chunk in sse {
+                            let created = Int(Date().timeIntervalSince1970)
+                            let evt = "data: {\"id\":\"\(streamId)\",\"object\":\"chat.completion.chunk\",\"created\":\(created),\"model\":\(modelJSON),\"choices\":[{\"index\":0,\"delta\":{\"content\":\(jsonString(chunk))},\"finish_reason\":null}]}\n\n"
+                            cont.yield(evt)
+                        }
+                        cont.yield("data: [DONE]\n\n")
+                        cont.finish()
+                    } catch {
+                        // Error text is arbitrary; JSON-escape it so the
+                        // event stays parseable.
+                        cont.yield("data: {\"error\":\(jsonString(error.localizedDescription))}\n\n")
+                        cont.finish()
+                    }
+                }
+                // Stop forwarding (and release the upstream token stream)
+                // as soon as the consumer goes away.
+                cont.onTermination = { _ in forwarder.cancel() }
+            }
+            return .sse(mappedStream)
+        }
+        let text: String
+        do {
+            text = try await requestGenerator.generate(promptTokens: promptTokens)
+        } catch {
+            return .error(500, "generation failed: \(error.localizedDescription)")
+        }
+        let responseJSON: [String: Any] = [
+            "id": UUID().uuidString,
+            "object": "chat.completion",
+            "created": Int(Date().timeIntervalSince1970),
+            "model": modelId,
+            "choices": [[
+                "index": 0,
+                "message": ["role": "assistant", "content": text],
+                "finish_reason": "stop"
+            ]],
+            "usage": [
+                "prompt_tokens": promptTokens.count,
+                "completion_tokens": -1,  // CoreML decoding doesn't track this per checkpoint
+                "total_tokens": -1
+            ]
+        ]
+        return .json(200, responseJSON)
+    }
+    /// Handles a legacy text-completion request by wrapping `prompt` in a
+    /// single user message and delegating to `handleChatCompletion`.
+    /// A missing or non-string `prompt` is treated as the empty string.
+    public func handleCompletion(body: [String: Any], ctx: HandlerContext) async throws -> HandlerResponse {
+        let prompt = body["prompt"] as? String ?? ""
+        var chatBody: [String: Any] = [
+            "messages": [["role": "user", "content": prompt]],
+            "stream": body["stream"] as? Bool ?? false,
+        ]
+        // Forward tuning parameters only when the caller supplied them, so
+        // the chat handler can apply its own defaults for the rest.
+        for key in ["temperature", "top_p", "max_tokens"] {
+            if let value = body[key] { chatBody[key] = value }
+        }
+        return try await handleChatCompletion(body: chatBody, ctx: ctx)
+    }
+    /// Embeddings are not implemented; always answers 501.
+    public func handleEmbeddings(body: [String: Any], ctx: HandlerContext) async throws -> HandlerResponse {
+        return .error(501, "embeddings not yet supported by the CoreML backend")
+    }
+    /// Lists the single model served by this backend.
+    public func handleModels(ctx: HandlerContext) async throws -> HandlerResponse {
+        return .json(200, [
+            "object": "list",
+            "data": [["id": modelId, "object": "model", "owned_by": "dvai-bridge"]]
+        ])
+    }
+    /// JSON-encode a single string value (produces a quoted JSON string).
+    ///
+    /// The string is serialised inside a one-element array and the
+    /// surrounding brackets are stripped. If serialisation fails for any
+    /// reason the result is the empty JSON string `""`, never an empty
+    /// Swift string, so the caller's hand-built JSON stays well-formed.
+    private func jsonString(_ s: String) -> String {
+        guard let data = try? JSONSerialization.data(withJSONObject: [s], options: []),
+              let encoded = String(data: data, encoding: .utf8),
+              encoded.hasPrefix("["), encoded.hasSuffix("]")
+        else {
+            return "\"\""
+        }
+        return String(encoded.dropFirst().dropLast())
+    }
+}

package/ios/Sources/DVAICoreMLCore/CoreMLPluginState.swift ADDED Viewed

@@ -0,0 +1,130 @@
+import Foundation
+import CoreML
+#if !COCOAPODS
+import DVAISharedCore   // HttpServer, DVAIHandlers, HandlerContext, CORSConfig
+#endif
+/// Public PluginState mirroring DVAILlamaCore.PluginState's shape.
+/// Boots an `HttpServer` on `127.0.0.1:<port>` (with port-fallback),
+/// loads the .mlmodelc model + tokenizer, and serves OpenAI-compatible
+/// requests via CoreMLHandlers.
+///
+/// NOTE(review): earlier revisions of these comments name both Telegraph
+/// and Hummingbird as the HTTP library behind `HttpServer`; the actual
+/// implementation is not visible from this file — confirm.
+///
+/// Requires iOS 18 / macOS 15 for MLState (KV-cache stateful decoding).
+@available(iOS 18.0, macOS 15.0, *)
+public actor CoreMLPluginState {
+    private var httpServer: HttpServer?
+    private var generator: CoreMLGenerator?
+    private var modelId: String = ""
+    private var isRunning: Bool = false
+    private var baseUrl: String?
+    private var port: Int?
+    public init() {}
+    /// Loads the model and tokenizer, binds the HTTP server and records the
+    /// running state. If the plugin is already running it is stopped first.
+    ///
+    /// - Parameter opts: Untyped options. `modelPath` and `tokenizerPath`
+    ///   are required non-empty strings; every other key read below is
+    ///   optional and falls back to the default shown next to it.
+    /// - Returns: `baseUrl`, `port`, `backend` and `modelId` of the started
+    ///   server.
+    /// - Throws: `CoreMLBackendError` for missing paths, plus whatever the
+    ///   tokenizer, engine or server bind throws.
+    public func start(opts: [String: Any]) async throws -> [String: Any] {
+        if isRunning { try await stop() }
+        guard let modelPath = opts["modelPath"] as? String, !modelPath.isEmpty else {
+            throw CoreMLBackendError.modelLoadFailed(
+                reason: "modelPath is required for the CoreML backend")
+        }
+        guard let tokenizerPath = opts["tokenizerPath"] as? String, !tokenizerPath.isEmpty else {
+            throw CoreMLBackendError.tokenizerLoadFailed(
+                reason: "tokenizerPath is required (path to a directory containing " +
+                        "tokenizer.json + tokenizer_config.json)")
+        }
+        let modelURL = URL(fileURLWithPath: modelPath)
+        let tokenizerDir = URL(fileURLWithPath: tokenizerPath)
+        // Optional opts with defaults — match Apple's stateful Llama-3.2
+        // conversion conventions (snake_case, matching HF / PyTorch).
+        let inputName = (opts["coremlInputName"] as? String) ?? "input_ids"
+        let causalMaskName = (opts["coremlCausalMaskName"] as? String) ?? "causal_mask"
+        let outputName = (opts["coremlOutputName"] as? String) ?? "logits"
+        let maxContextTokens = (opts["contextSize"] as? Int) ?? 2048
+        let temperature = (opts["temperature"] as? Double).map(Float.init) ?? 0.0
+        let topP = (opts["topP"] as? Double).map(Float.init) ?? 1.0
+        let topK = (opts["topK"] as? Int) ?? 0
+        let maxNewTokens = (opts["maxNewTokens"] as? Int) ?? 512
+        let httpBasePort = (opts["httpBasePort"] as? Int) ?? 38883
+        let httpMaxPortAttempts = (opts["httpMaxPortAttempts"] as? Int) ?? 16
+        // Load tokenizer first — its eosTokenId is needed by the engine.
+        let tokenizer = try await CoreMLTokenizer(tokenizerDir: tokenizerDir)
+        let engine = try CoreMLEngine(
+            modelURL: modelURL,
+            inputName: inputName,
+            causalMaskName: causalMaskName,
+            outputName: outputName,
+            maxContextTokens: maxContextTokens,
+            eosTokenId: tokenizer.eosTokenId
+        )
+        let sampler = CoreMLSampler(temperature: temperature, topP: topP, topK: topK)
+        let gen = CoreMLGenerator(
+            engine: engine,
+            tokenizer: tokenizer,
+            sampler: sampler,
+            maxNewTokens: maxNewTokens
+        )
+        let modelIdValue = modelURL.deletingPathExtension().lastPathComponent
+        let handlers = CoreMLHandlers(generator: gen, modelId: modelIdValue)
+        // Build context + cors first, install routes, THEN bind — the
+        // install → bind order is mandatory for the underlying server.
+        let ctx = HandlerContext(modelId: modelIdValue, backendName: "coreml")
+        // Note: plan used DispatchConfig which doesn't exist in DVAILlamaCore.
+        // Real type is CORSConfig (public). parseCors() below maps opts → CORSConfig.
+        let corsConfig = parseCors(opts["corsOrigin"])
+        let server = HttpServer()
+        await server.installRoutes(handlers: handlers, ctx: ctx, corsConfig: corsConfig)
+        let boundPort = try await server.tryBind(
+            basePort: httpBasePort,
+            maxAttempts: httpMaxPortAttempts,
+            host: "127.0.0.1"
+        )
+        // Compute the URL into a local so the return value below needs no
+        // force-unwrap of the optional stored property.
+        let boundBaseUrl = "http://127.0.0.1:\(boundPort)/v1"
+        self.httpServer = server
+        self.generator = gen
+        self.modelId = modelIdValue
+        self.port = boundPort
+        self.baseUrl = boundBaseUrl
+        self.isRunning = true
+        return [
+            "baseUrl": boundBaseUrl,
+            "port": boundPort,
+            "backend": "coreml",
+            "modelId": modelIdValue,
+        ]
+    }
+    /// Stops the HTTP server (if any) and clears all running state.
+    /// Safe to call when the plugin is not running.
+    public func stop() async throws {
+        await httpServer?.stop()
+        httpServer = nil
+        generator = nil
+        modelId = ""
+        baseUrl = nil
+        port = nil
+        isRunning = false
+    }
+    /// Snapshot of the current state: always `running`; `baseUrl` and
+    /// `port` when bound; `backend` while running.
+    public func statusInfo() -> [String: Any] {
+        var dict: [String: Any] = ["running": isRunning]
+        if let baseUrl = baseUrl { dict["baseUrl"] = baseUrl }
+        // Mirrors the `port` value returned by `start(opts:)`.
+        if let port = port { dict["port"] = port }
+        if isRunning { dict["backend"] = "coreml" }
+        return dict
+    }
+    // MARK: - Private
+    /// Maps the untyped `corsOrigin` option to a `CORSConfig`: `"*"` or a
+    /// missing/unrecognised value → wildcard, any other string → exact
+    /// origin, string array → allowlist.
+    private func parseCors(_ raw: Any?) -> CORSConfig {
+        if let s = raw as? String { return s == "*" ? .wildcard : .exact(s) }
+        if let arr = raw as? [String] { return .allowlist(arr) }
+        return .wildcard
+    }
+}

package/ios/Sources/DVAICoreMLCore/Internal/CoreMLEngine.swift ADDED Viewed

@@ -0,0 +1,137 @@
+import Foundation
+import CoreML
+/// Wraps an `MLModel` plus the shape conventions our CoreML LLM checkpoints
+/// follow. `makeConversationState()` produces a fresh `MLState` for each
+/// conversation so token-by-token decoding can preserve KV-cache across calls.
+///
+/// iOS 18 / macOS 15 API notes:
+///   - `MLModel.makeState()` returns `MLState` (non-optional, throws is not in
+///     the signature).
+///   - `MLModel.prediction(from:using:options:)` takes state via the `using:`
+///     label, NOT `state:`. Verified against Apple's CoreML docs.
+///
+/// NOTE(review): earlier comments in this file disagreed on whether
+/// `makeState()` can crash at runtime for non-stateful models; that
+/// behaviour is not established by anything in this file — confirm before
+/// relying on either reading.
+///
+/// Declared `@unchecked Sendable`: all stored properties are `let`, but the
+/// thread-safety of sharing the wrapped `MLModel` is asserted, not checked
+/// by the compiler.
+@available(iOS 18.0, macOS 15.0, *)
+internal final class CoreMLEngine: @unchecked Sendable {
+    let model: MLModel
+    /// Name of the token-id input feature. Apple-converted Llama-3.2 stateful
+    /// checkpoints use `input_ids` (snake_case, matching HF / PyTorch
+    /// convention). Override via `opts["coremlInputName"]` for non-standard
+    /// checkpoints.
+    let inputName: String
+    /// Name of the causal-mask input feature. Apple-converted stateful
+    /// checkpoints declare a `causal_mask` Float16 multiarray of shape
+    /// `[1, 1, q_len, kv_len]` — the model uses it inside
+    /// `Ios18.scaledDotProductAttention`. Empty string disables the
+    /// causal-mask input (for older or simpler checkpoints that don't
+    /// declare it). Override via `opts["coremlCausalMaskName"]`.
+    let causalMaskName: String
+    let outputName: String      // default: "logits"
+    let maxContextTokens: Int   // from opts; default 2048
+    let eosTokenId: Int         // from tokenizer or opts
+    /// Loads the compiled model at `modelURL` and records the feature-name
+    /// conventions used by `runStep`.
+    ///
+    /// - Parameters:
+    ///   - modelURL: Location passed to `MLModel(contentsOf:configuration:)`.
+    ///   - inputName: Token-id input feature name.
+    ///   - causalMaskName: Causal-mask input feature name; empty disables it.
+    ///   - outputName: Output feature holding the logits.
+    ///   - maxContextTokens: Stored for callers; not enforced by this class.
+    ///   - eosTokenId: End-of-sequence token id, stored for callers.
+    ///   - computeUnits: Forwarded to `MLModelConfiguration.computeUnits`.
+    /// - Throws: `CoreMLBackendError.modelLoadFailed` wrapping any error
+    ///   raised while loading the model.
+    init(
+        modelURL: URL,
+        inputName: String = "input_ids",
+        causalMaskName: String = "causal_mask",
+        outputName: String = "logits",
+        maxContextTokens: Int = 2048,
+        eosTokenId: Int,
+        computeUnits: MLComputeUnits = .all
+    ) throws {
+        let cfg = MLModelConfiguration()
+        cfg.computeUnits = computeUnits
+        do {
+            self.model = try MLModel(contentsOf: modelURL, configuration: cfg)
+        } catch {
+            throw CoreMLBackendError.modelLoadFailed(reason: "\(error)")
+        }
+        self.inputName = inputName
+        self.causalMaskName = causalMaskName
+        self.outputName = outputName
+        self.maxContextTokens = maxContextTokens
+        self.eosTokenId = eosTokenId
+    }
+    /// Make a fresh KV-cache state for a new conversation.
+    /// Wraps `MLModel.makeState()` (iOS 18 / macOS 15).
+    /// Note: `makeState()` is NOT throwing in Apple's API; it returns `MLState`
+    /// directly.
+    /// NOTE(review): presumably a non-stateful model yields a state object
+    /// with no effect and any real validation happens at prediction time —
+    /// unverified here; confirm against Apple's documentation.
+    func makeConversationState() -> MLState {
+        // matches MLModel.makeState() iOS 18 non-throwing signature
+        return model.makeState()
+    }
+    /// Run a single-token forward pass using the given KV-cache state.
+    ///
+    /// Uses `MLModel.prediction(from:using:options:)` — the `using:` label
+    /// carries the `MLState` object (not `state:`). Verified against Apple docs.
+    ///
+    /// - Parameters:
+    ///   - token: New token id to feed (the K/V is appended to `state` by
+    ///     the model's `Ios18.writeState` op as a side-effect).
+    ///   - kvCachePosition: 0-based position of the new token in the
+    ///     conversation. The first prompt token is position 0, second is 1,
+    ///     etc. Used to size the causal-mask input. Caller increments this
+    ///     across runStep calls within the same conversation.
+    ///   - state: KV-cache `MLState` from `makeConversationState()`.
+    /// - Returns: The multiarray stored under `outputName` in the prediction
+    ///   output.
+    /// - Throws: Errors from `MLMultiArray` / feature-provider construction
+    ///   or from the prediction call, and
+    ///   `CoreMLBackendError.generationFailed` when the output lacks a
+    ///   multiarray named `outputName`.
+    func runStep(token: Int, kvCachePosition: Int, state: MLState) throws -> MLMultiArray {
+        var features: [String: MLFeatureValue] = [:]
+        // input_ids: [1, 1] Int32 with the new token. Direct memory write
+        // (rather than NSNumber subscript) matches Apple's documented
+        // pattern for primitive multiarray data and avoids unnecessary
+        // bridging overhead.
+        //
+        // KNOWN ISSUE: on the reference Apple-converted Llama-3.2 stateful
+        // 4-bit checkpoint, the FIRST `model.prediction(from:using:)` call
+        // crashes hard inside CoreML's C++ IR layer with:
+        //
+        //   Error: Cannot retrieve vector from IRValue format int32
+        //
+        // The crash is reproducible on BOTH iOS Simulator and macOS-native,
+        // which rules out the previously-suspected simulator-only Espresso
+        // limitation. Verified that:
+        //   - Model loads fine (no "Failed to build execution plan").
+        //   - input_ids name + shape match the model description.
+        //   - causal_mask name + shape match Apple's published convention.
+        //
+        // The error manifests as a process crash (xctest exits unexpectedly,
+        // not a Swift Error throw), so callers can't try/catch it. The
+        // RealModelIntegrationTest gates the test off until the cause is
+        // understood. Live debugging on a real iOS device with Instruments
+        // is the next step. See:
+        //   packages/dvai-bridge-ios/ios/Tests/DVAIBridgeTests/RealModelIntegrationTest.swift
+        let inputArr = try MLMultiArray(shape: [1, 1], dataType: .int32)
+        // `Int32(token)` traps if `token` does not fit in 32 bits.
+        inputArr.dataPointer.bindMemory(to: Int32.self, capacity: 1).pointee = Int32(token)
+        features[inputName] = MLFeatureValue(multiArray: inputArr)
+        // causal_mask: [1, 1, 1, kvCachePosition+1] Float16, all zeros.
+        //
+        // For autoregressive single-token decoding the new query attends to
+        // every K/V position seen so far (0..kvCachePosition inclusive), so
+        // the mask is all-zeros (zero = unmasked, large-negative = masked).
+        // Apple's stateful Llama-3.2 checkpoints declare this input as
+        // Float16 with shape flexibility `[1, 1, 1...2048, 1...2048]`; we
+        // produce the minimal slice for the current step.
+        //
+        // The mask is only attached when a name is configured AND the
+        // loaded model actually declares an input with that name.
+        if !causalMaskName.isEmpty,
+           model.modelDescription.inputDescriptionsByName[causalMaskName] != nil
+        {
+            // Clamp so a non-positive position still yields a length-1 mask.
+            let kvLen = max(1, kvCachePosition + 1)
+            let mask = try MLMultiArray(shape: [1, 1, 1, NSNumber(value: kvLen)], dataType: .float16)
+            // Float16 zero == bit pattern 0x0000, so memset(0) suffices.
+            memset(mask.dataPointer, 0, mask.count * MemoryLayout<UInt16>.size)
+            features[causalMaskName] = MLFeatureValue(multiArray: mask)
+        }
+        let input = try MLDictionaryFeatureProvider(dictionary: features)
+        // `prediction(from:using:options:)` is synchronous in Apple's CoreML iOS 18 API.
+        // Wrapped in CoreMLGenerator via async Task to avoid blocking the caller's thread.
+        let output = try model.prediction(from: input, using: state, options: MLPredictionOptions())
+        guard let logits = output.featureValue(for: outputName)?.multiArrayValue else {
+            throw CoreMLBackendError.generationFailed(reason: "no '\(outputName)' output in model prediction")
+        }
+        return logits
+    }
+}

package/ios/Sources/DVAICoreMLCore/Internal/CoreMLGenerator.swift ADDED Viewed

@@ -0,0 +1,108 @@
+import Foundation
+import CoreML
+/// Orchestrates `CoreMLEngine` + `CoreMLTokenizer` + `CoreMLSampler` to
+/// produce text from a prompt via autoregressive decoding.
+///
+/// CoreML prediction note (iOS 18):
+///   `MLModel.prediction(from:using:options:)` is synchronous, so both
+///   entry points run the decode loop inside a `Task.detached` to keep the
+///   blocking work off the caller's executor. The loop is cooperative with
+///   cancellation: cancelling the awaiting task (`generate`) or dropping the
+///   returned stream (`generateStream`) cancels the detached work, which
+///   stops at the next token boundary.
+@available(iOS 18.0, macOS 15.0, *)
+internal struct CoreMLGenerator: @unchecked Sendable {
+    let engine: CoreMLEngine
+    let tokenizer: CoreMLTokenizer
+    let sampler: CoreMLSampler
+    let maxNewTokens: Int
+    /// Buffered generation. Runs the full decode loop and returns the decoded text.
+    ///
+    /// - Parameter promptTokens: Prompt token ids; must not be empty.
+    /// - Returns: The decoded text of every generated token (EOS excluded).
+    /// - Throws: `CoreMLBackendError.generationFailed` for an empty prompt,
+    ///   `CancellationError` if the calling task is cancelled, or any error
+    ///   thrown by the engine.
+    func generate(promptTokens: [Int]) async throws -> String {
+        let work = Task.detached(priority: .userInitiated) { () async throws -> String in
+            var generated: [Int] = []
+            try self.decode(promptTokens: promptTokens) { generated.append($0) }
+            return self.tokenizer.decode(tokens: generated)
+        }
+        // A detached task does not inherit cancellation, so forward it
+        // explicitly from the awaiting task.
+        return try await withTaskCancellationHandler {
+            try await work.value
+        } onCancel: {
+            work.cancel()
+        }
+    }
+    /// Streaming generation. Yields each decoded token chunk via `AsyncThrowingStream`.
+    ///
+    /// The stream finishes normally at EOS or after `maxNewTokens` tokens,
+    /// and finishes by throwing on an empty prompt or an engine error.
+    /// Terminating the stream from the consumer side cancels the decode.
+    func generateStream(promptTokens: [Int]) -> AsyncThrowingStream<String, Error> {
+        AsyncThrowingStream { continuation in
+            let work = Task.detached(priority: .userInitiated) {
+                do {
+                    try self.decode(promptTokens: promptTokens) { token in
+                        continuation.yield(self.tokenizer.decode(token: token))
+                    }
+                    continuation.finish()
+                } catch {
+                    continuation.finish(throwing: error)
+                }
+            }
+            // Fires on consumer cancellation as well as on normal finish;
+            // cancelling an already-finished task is a no-op.
+            continuation.onTermination = { _ in work.cancel() }
+        }
+    }
+    /// Shared prefill + decode loop used by both entry points.
+    ///
+    /// Prefill + decode unified: each `runStep` returns logits for the
+    /// *next* token, so after feeding all prompt tokens the last logits give
+    /// the first generated token. (A previous iteration of this code re-fed
+    /// `promptTokens.last` as a separate step, which double-counted that
+    /// token in the KV cache.)
+    ///
+    /// - Parameters:
+    ///   - promptTokens: Prompt token ids; must not be empty.
+    ///   - onToken: Invoked synchronously, in order, with each generated
+    ///     token id. The EOS token is never reported.
+    /// - Throws: `CoreMLBackendError.generationFailed` for an empty prompt,
+    ///   `CancellationError` when the current task is cancelled, or any
+    ///   error thrown by `CoreMLEngine.runStep`.
+    private func decode(promptTokens: [Int], onToken: (Int) -> Void) throws {
+        guard let firstToken = promptTokens.first else {
+            throw CoreMLBackendError.generationFailed(reason: "prompt tokens are empty")
+        }
+        let state = engine.makeConversationState()
+        var kvPos = 0
+        var lastLogits = try engine.runStep(
+            token: firstToken, kvCachePosition: kvPos, state: state
+        )
+        kvPos += 1
+        for token in promptTokens.dropFirst() {
+            try Task.checkCancellation()
+            lastLogits = try engine.runStep(
+                token: token, kvCachePosition: kvPos, state: state
+            )
+            kvPos += 1
+        }
+        var nextToken = sampler.sample(logits: lastLogits)
+        // Clamp: `0 ..< n` traps for negative `n`, and `maxNewTokens` can
+        // originate from untyped options / request bodies.
+        for _ in 0 ..< max(0, maxNewTokens) {
+            if nextToken == engine.eosTokenId { break }
+            try Task.checkCancellation()
+            onToken(nextToken)
+            lastLogits = try engine.runStep(
+                token: nextToken, kvCachePosition: kvPos, state: state
+            )
+            kvPos += 1
+            nextToken = sampler.sample(logits: lastLogits)
+        }
+    }
+}