@dvai-bridge/ios 4.0.0 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/Package.swift +104 -104
  2. package/ios/Sources/DVAIBridge/BackendKind.swift +23 -23
  3. package/ios/Sources/DVAIBridge/BoundServer.swift +46 -46
  4. package/ios/Sources/DVAIBridge/DVAIBridge.swift +658 -658
  5. package/ios/Sources/DVAIBridge/DVAIBridgeConfig.swift +86 -86
  6. package/ios/Sources/DVAIBridge/DVAIBridgeError.swift +33 -33
  7. package/ios/Sources/DVAIBridge/Internal/BackendSelector.swift +59 -59
  8. package/ios/Sources/DVAIBridge/Internal/ProgressBroadcaster.swift +84 -84
  9. package/ios/Sources/DVAIBridge/License/Audience.swift +133 -133
  10. package/ios/Sources/DVAIBridge/License/Discovery.swift +164 -164
  11. package/ios/Sources/DVAIBridge/License/LicenseValidator.swift +392 -392
  12. package/ios/Sources/DVAIBridge/License/PublicKeys.swift +114 -114
  13. package/ios/Sources/DVAIBridge/License/Types.swift +195 -195
  14. package/ios/Sources/DVAIBridge/Offload/OffloadConfig.swift +118 -118
  15. package/ios/Sources/DVAIBridge/ProgressEvent.swift +34 -34
  16. package/ios/Sources/DVAICoreMLCore/CoreMLBackendError.swift +19 -19
  17. package/ios/Sources/DVAICoreMLCore/CoreMLHandlers.swift +123 -123
  18. package/ios/Sources/DVAICoreMLCore/CoreMLPluginState.swift +130 -130
  19. package/ios/Sources/DVAICoreMLCore/Internal/CoreMLEngine.swift +137 -137
  20. package/ios/Sources/DVAICoreMLCore/Internal/CoreMLGenerator.swift +108 -108
  21. package/ios/Sources/DVAICoreMLCore/Internal/CoreMLSampler.swift +96 -96
  22. package/ios/Sources/DVAICoreMLCore/Internal/CoreMLTokenizer.swift +69 -69
  23. package/ios/Tests/DVAIBridgeTests/BackendSelectorTests.swift +53 -53
  24. package/ios/Tests/DVAIBridgeTests/CoreMLEngineTests.swift +18 -18
  25. package/ios/Tests/DVAIBridgeTests/CoreMLGeneratorShapeTests.swift +11 -11
  26. package/ios/Tests/DVAIBridgeTests/CoreMLHandlersTests.swift +32 -32
  27. package/ios/Tests/DVAIBridgeTests/CoreMLPluginStateTests.swift +41 -41
  28. package/ios/Tests/DVAIBridgeTests/CoreMLSamplerTests.swift +40 -40
  29. package/ios/Tests/DVAIBridgeTests/CoreMLTokenizerTests.swift +19 -19
  30. package/ios/Tests/DVAIBridgeTests/DVAIBridgeAPIShapeTests.swift +37 -37
  31. package/ios/Tests/DVAIBridgeTests/DVAIBridgeConfigTests.swift +52 -52
  32. package/ios/Tests/DVAIBridgeTests/DVAIBridgeErrorTests.swift +33 -33
  33. package/ios/Tests/DVAIBridgeTests/LicenseValidatorTests.swift +658 -658
  34. package/ios/Tests/DVAIBridgeTests/ProgressBroadcasterTests.swift +69 -69
  35. package/ios/Tests/DVAIBridgeTests/ProgressEventTests.swift +25 -25
  36. package/ios/Tests/DVAIBridgeTests/ReactiveStateTests.swift +45 -45
  37. package/ios/Tests/DVAIBridgeTests/RealModelIntegrationTest.swift +385 -359
  38. package/package.json +3 -4
  39. package/DVAIBridge.podspec +0 -120
  40. package/LICENSE +0 -51
  41. package/README.md +0 -199
@@ -1,130 +1,130 @@
1
- import Foundation
2
- import CoreML
3
- #if !COCOAPODS
4
- import DVAISharedCore // HttpServer, DVAIHandlers, HandlerContext, CORSConfig
5
- #endif
6
-
7
- /// Public PluginState mirroring DVAILlamaCore.PluginState's shape.
8
- /// Boots a Telegraph HTTP server on `127.0.0.1:<port>` (with port-fallback),
9
- /// loads the .mlmodelc model + tokenizer, and serves OpenAI-compatible
10
- /// requests via CoreMLHandlers.
11
- ///
12
- /// Requires iOS 18 / macOS 15 for MLState (KV-cache stateful decoding).
13
- @available(iOS 18.0, macOS 15.0, *)
14
- public actor CoreMLPluginState {
15
- private var httpServer: HttpServer?
16
- private var generator: CoreMLGenerator?
17
- private var modelId: String = ""
18
- private var isRunning: Bool = false
19
- private var baseUrl: String?
20
- private var port: Int?
21
-
22
- public init() {}
23
-
24
- public func start(opts: [String: Any]) async throws -> [String: Any] {
25
- if isRunning { try await stop() }
26
-
27
- guard let modelPath = opts["modelPath"] as? String, !modelPath.isEmpty else {
28
- throw CoreMLBackendError.modelLoadFailed(
29
- reason: "modelPath is required for the CoreML backend")
30
- }
31
- guard let tokenizerPath = opts["tokenizerPath"] as? String, !tokenizerPath.isEmpty else {
32
- throw CoreMLBackendError.tokenizerLoadFailed(
33
- reason: "tokenizerPath is required (path to a directory containing " +
34
- "tokenizer.json + tokenizer_config.json)")
35
- }
36
-
37
- let modelURL = URL(fileURLWithPath: modelPath)
38
- let tokenizerDir = URL(fileURLWithPath: tokenizerPath)
39
-
40
- // Optional opts with defaults — match Apple's stateful Llama-3.2
41
- // conversion conventions (snake_case, matching HF / PyTorch).
42
- let inputName = (opts["coremlInputName"] as? String) ?? "input_ids"
43
- let causalMaskName = (opts["coremlCausalMaskName"] as? String) ?? "causal_mask"
44
- let outputName = (opts["coremlOutputName"] as? String) ?? "logits"
45
- let maxContextTokens = (opts["contextSize"] as? Int) ?? 2048
46
- let temperature = (opts["temperature"] as? Double).map(Float.init) ?? 0.0
47
- let topP = (opts["topP"] as? Double).map(Float.init) ?? 1.0
48
- let topK = (opts["topK"] as? Int) ?? 0
49
- let maxNewTokens = (opts["maxNewTokens"] as? Int) ?? 512
50
- let httpBasePort = (opts["httpBasePort"] as? Int) ?? 38883
51
- let httpMaxPortAttempts = (opts["httpMaxPortAttempts"] as? Int) ?? 16
52
-
53
- // Load tokenizer first — its eosTokenId is needed by the engine.
54
- let tokenizer = try await CoreMLTokenizer(tokenizerDir: tokenizerDir)
55
- let engine = try CoreMLEngine(
56
- modelURL: modelURL,
57
- inputName: inputName,
58
- causalMaskName: causalMaskName,
59
- outputName: outputName,
60
- maxContextTokens: maxContextTokens,
61
- eosTokenId: tokenizer.eosTokenId
62
- )
63
-
64
- let sampler = CoreMLSampler(temperature: temperature, topP: topP, topK: topK)
65
- let gen = CoreMLGenerator(
66
- engine: engine,
67
- tokenizer: tokenizer,
68
- sampler: sampler,
69
- maxNewTokens: maxNewTokens
70
- )
71
-
72
- let modelIdValue = modelURL.deletingPathExtension().lastPathComponent
73
- let handlers = CoreMLHandlers(generator: gen, modelId: modelIdValue)
74
-
75
- // Build context + cors first, install routes, THEN bind —
76
- // Hummingbird requires routes at Application construction time
77
- // so the install → bind order is mandatory.
78
- let ctx = HandlerContext(modelId: modelIdValue, backendName: "coreml")
79
- // Note: plan used DispatchConfig which doesn't exist in DVAILlamaCore.
80
- // Real type is CORSConfig (public). parseCors() below maps opts → CORSConfig.
81
- let corsConfig = parseCors(opts["corsOrigin"])
82
- let server = HttpServer()
83
- await server.installRoutes(handlers: handlers, ctx: ctx, corsConfig: corsConfig)
84
-
85
- let boundPort = try await server.tryBind(
86
- basePort: httpBasePort,
87
- maxAttempts: httpMaxPortAttempts,
88
- host: "127.0.0.1"
89
- )
90
-
91
- self.httpServer = server
92
- self.generator = gen
93
- self.modelId = modelIdValue
94
- self.port = boundPort
95
- self.baseUrl = "http://127.0.0.1:\(boundPort)/v1"
96
- self.isRunning = true
97
-
98
- return [
99
- "baseUrl": self.baseUrl!,
100
- "port": boundPort,
101
- "backend": "coreml",
102
- "modelId": modelIdValue,
103
- ]
104
- }
105
-
106
- public func stop() async throws {
107
- await httpServer?.stop()
108
- httpServer = nil
109
- generator = nil
110
- modelId = ""
111
- baseUrl = nil
112
- port = nil
113
- isRunning = false
114
- }
115
-
116
- public func statusInfo() -> [String: Any] {
117
- var dict: [String: Any] = ["running": isRunning]
118
- if let baseUrl = baseUrl { dict["baseUrl"] = baseUrl }
119
- if isRunning { dict["backend"] = "coreml" }
120
- return dict
121
- }
122
-
123
- // MARK: - Private
124
-
125
- private func parseCors(_ raw: Any?) -> CORSConfig {
126
- if let s = raw as? String { return s == "*" ? .wildcard : .exact(s) }
127
- if let arr = raw as? [String] { return .allowlist(arr) }
128
- return .wildcard
129
- }
130
- }
1
+ import Foundation
2
+ import CoreML
3
+ #if !COCOAPODS
4
+ import DVAISharedCore // HttpServer, DVAIHandlers, HandlerContext, CORSConfig
5
+ #endif
6
+
7
+ /// Public PluginState mirroring DVAILlamaCore.PluginState's shape.
8
+ /// Boots a Telegraph HTTP server on `127.0.0.1:<port>` (with port-fallback),
9
+ /// loads the .mlmodelc model + tokenizer, and serves OpenAI-compatible
10
+ /// requests via CoreMLHandlers.
11
+ ///
12
+ /// Requires iOS 18 / macOS 15 for MLState (KV-cache stateful decoding).
13
+ @available(iOS 18.0, macOS 15.0, *)
14
+ public actor CoreMLPluginState {
15
+ private var httpServer: HttpServer?
16
+ private var generator: CoreMLGenerator?
17
+ private var modelId: String = ""
18
+ private var isRunning: Bool = false
19
+ private var baseUrl: String?
20
+ private var port: Int?
21
+
22
+ public init() {}
23
+
24
+ public func start(opts: [String: Any]) async throws -> [String: Any] {
25
+ if isRunning { try await stop() }
26
+
27
+ guard let modelPath = opts["modelPath"] as? String, !modelPath.isEmpty else {
28
+ throw CoreMLBackendError.modelLoadFailed(
29
+ reason: "modelPath is required for the CoreML backend")
30
+ }
31
+ guard let tokenizerPath = opts["tokenizerPath"] as? String, !tokenizerPath.isEmpty else {
32
+ throw CoreMLBackendError.tokenizerLoadFailed(
33
+ reason: "tokenizerPath is required (path to a directory containing " +
34
+ "tokenizer.json + tokenizer_config.json)")
35
+ }
36
+
37
+ let modelURL = URL(fileURLWithPath: modelPath)
38
+ let tokenizerDir = URL(fileURLWithPath: tokenizerPath)
39
+
40
+ // Optional opts with defaults — match Apple's stateful Llama-3.2
41
+ // conversion conventions (snake_case, matching HF / PyTorch).
42
+ let inputName = (opts["coremlInputName"] as? String) ?? "input_ids"
43
+ let causalMaskName = (opts["coremlCausalMaskName"] as? String) ?? "causal_mask"
44
+ let outputName = (opts["coremlOutputName"] as? String) ?? "logits"
45
+ let maxContextTokens = (opts["contextSize"] as? Int) ?? 2048
46
+ let temperature = (opts["temperature"] as? Double).map(Float.init) ?? 0.0
47
+ let topP = (opts["topP"] as? Double).map(Float.init) ?? 1.0
48
+ let topK = (opts["topK"] as? Int) ?? 0
49
+ let maxNewTokens = (opts["maxNewTokens"] as? Int) ?? 512
50
+ let httpBasePort = (opts["httpBasePort"] as? Int) ?? 38883
51
+ let httpMaxPortAttempts = (opts["httpMaxPortAttempts"] as? Int) ?? 16
52
+
53
+ // Load tokenizer first — its eosTokenId is needed by the engine.
54
+ let tokenizer = try await CoreMLTokenizer(tokenizerDir: tokenizerDir)
55
+ let engine = try CoreMLEngine(
56
+ modelURL: modelURL,
57
+ inputName: inputName,
58
+ causalMaskName: causalMaskName,
59
+ outputName: outputName,
60
+ maxContextTokens: maxContextTokens,
61
+ eosTokenId: tokenizer.eosTokenId
62
+ )
63
+
64
+ let sampler = CoreMLSampler(temperature: temperature, topP: topP, topK: topK)
65
+ let gen = CoreMLGenerator(
66
+ engine: engine,
67
+ tokenizer: tokenizer,
68
+ sampler: sampler,
69
+ maxNewTokens: maxNewTokens
70
+ )
71
+
72
+ let modelIdValue = modelURL.deletingPathExtension().lastPathComponent
73
+ let handlers = CoreMLHandlers(generator: gen, modelId: modelIdValue)
74
+
75
+ // Build context + cors first, install routes, THEN bind —
76
+ // Hummingbird requires routes at Application construction time
77
+ // so the install → bind order is mandatory.
78
+ let ctx = HandlerContext(modelId: modelIdValue, backendName: "coreml")
79
+ // Note: plan used DispatchConfig which doesn't exist in DVAILlamaCore.
80
+ // Real type is CORSConfig (public). parseCors() below maps opts → CORSConfig.
81
+ let corsConfig = parseCors(opts["corsOrigin"])
82
+ let server = HttpServer()
83
+ await server.installRoutes(handlers: handlers, ctx: ctx, corsConfig: corsConfig)
84
+
85
+ let boundPort = try await server.tryBind(
86
+ basePort: httpBasePort,
87
+ maxAttempts: httpMaxPortAttempts,
88
+ host: "127.0.0.1"
89
+ )
90
+
91
+ self.httpServer = server
92
+ self.generator = gen
93
+ self.modelId = modelIdValue
94
+ self.port = boundPort
95
+ self.baseUrl = "http://127.0.0.1:\(boundPort)/v1"
96
+ self.isRunning = true
97
+
98
+ return [
99
+ "baseUrl": self.baseUrl!,
100
+ "port": boundPort,
101
+ "backend": "coreml",
102
+ "modelId": modelIdValue,
103
+ ]
104
+ }
105
+
106
+ public func stop() async throws {
107
+ await httpServer?.stop()
108
+ httpServer = nil
109
+ generator = nil
110
+ modelId = ""
111
+ baseUrl = nil
112
+ port = nil
113
+ isRunning = false
114
+ }
115
+
116
+ public func statusInfo() -> [String: Any] {
117
+ var dict: [String: Any] = ["running": isRunning]
118
+ if let baseUrl = baseUrl { dict["baseUrl"] = baseUrl }
119
+ if isRunning { dict["backend"] = "coreml" }
120
+ return dict
121
+ }
122
+
123
+ // MARK: - Private
124
+
125
+ private func parseCors(_ raw: Any?) -> CORSConfig {
126
+ if let s = raw as? String { return s == "*" ? .wildcard : .exact(s) }
127
+ if let arr = raw as? [String] { return .allowlist(arr) }
128
+ return .wildcard
129
+ }
130
+ }
@@ -1,137 +1,137 @@
1
- import Foundation
2
- import CoreML
3
-
4
- /// Wraps an `MLModel` plus the shape conventions our CoreML LLM checkpoints
5
- /// follow. `makeConversationState()` produces a fresh `MLState` for each
6
- /// conversation so token-by-token decoding can preserve KV-cache across calls.
7
- ///
8
- /// iOS 18 / macOS 15 API notes:
9
- /// - `MLModel.makeState()` returns `MLState` (non-optional, throws is not in
10
- /// the signature — it can still crash at runtime on non-stateful models).
11
- /// - `MLModel.prediction(from:using:options:)` takes state via the `using:`
12
- /// label, NOT `state:`. Verified against Apple's CoreML docs.
13
- @available(iOS 18.0, macOS 15.0, *)
14
- internal final class CoreMLEngine: @unchecked Sendable {
15
- let model: MLModel
16
- /// Name of the token-id input feature. Apple-converted Llama-3.2 stateful
17
- /// checkpoints use `input_ids` (snake_case, matching HF / PyTorch
18
- /// convention). Override via `opts["coremlInputName"]` for non-standard
19
- /// checkpoints.
20
- let inputName: String
21
- /// Name of the causal-mask input feature. Apple-converted stateful
22
- /// checkpoints declare a `causal_mask` Float16 multiarray of shape
23
- /// `[1, 1, q_len, kv_len]` — the model uses it inside
24
- /// `Ios18.scaledDotProductAttention`. Empty string disables the
25
- /// causal-mask input (for older or simpler checkpoints that don't
26
- /// declare it). Override via `opts["coremlCausalMaskName"]`.
27
- let causalMaskName: String
28
- let outputName: String // default: "logits"
29
- let maxContextTokens: Int // from opts; default 2048
30
- let eosTokenId: Int // from tokenizer or opts
31
-
32
- init(
33
- modelURL: URL,
34
- inputName: String = "input_ids",
35
- causalMaskName: String = "causal_mask",
36
- outputName: String = "logits",
37
- maxContextTokens: Int = 2048,
38
- eosTokenId: Int,
39
- computeUnits: MLComputeUnits = .all
40
- ) throws {
41
- let cfg = MLModelConfiguration()
42
- cfg.computeUnits = computeUnits
43
- do {
44
- self.model = try MLModel(contentsOf: modelURL, configuration: cfg)
45
- } catch {
46
- throw CoreMLBackendError.modelLoadFailed(reason: "\(error)")
47
- }
48
- self.inputName = inputName
49
- self.causalMaskName = causalMaskName
50
- self.outputName = outputName
51
- self.maxContextTokens = maxContextTokens
52
- self.eosTokenId = eosTokenId
53
- }
54
-
55
- /// Make a fresh KV-cache state for a new conversation.
56
- /// Wraps `MLModel.makeState()` (iOS 18 / macOS 15).
57
- /// Note: `makeState()` is NOT throwing in Apple's API; it returns `MLState`
58
- /// directly. Non-stateful models will produce a state object that has no
59
- /// effect — they won't crash here, but predictions will behave as if
60
- /// stateless. Real validation happens at prediction time.
61
- func makeConversationState() -> MLState {
62
- // matches MLModel.makeState() iOS 18 non-throwing signature
63
- return model.makeState()
64
- }
65
-
66
- /// Run a single-token forward pass using the given KV-cache state.
67
- ///
68
- /// Uses `MLModel.prediction(from:using:options:)` — the `using:` label
69
- /// carries the `MLState` object (not `state:`). Verified against Apple docs.
70
- ///
71
- /// - Parameters:
72
- /// - token: New token id to feed (the K/V is appended to `state` by
73
- /// the model's `Ios18.writeState` op as a side-effect).
74
- /// - kvCachePosition: 0-based position of the new token in the
75
- /// conversation. The first prompt token is position 0, second is 1,
76
- /// etc. Used to size the causal-mask input. Caller increments this
77
- /// across runStep calls within the same conversation.
78
- /// - state: KV-cache `MLState` from `makeConversationState()`.
79
- func runStep(token: Int, kvCachePosition: Int, state: MLState) throws -> MLMultiArray {
80
- var features: [String: MLFeatureValue] = [:]
81
-
82
- // input_ids: [1, 1] Int32 with the new token. Direct memory write
83
- // (rather than NSNumber subscript) matches Apple's documented
84
- // pattern for primitive multiarray data and avoids unnecessary
85
- // bridging overhead.
86
- //
87
- // KNOWN ISSUE: on the reference Apple-converted Llama-3.2 stateful
88
- // 4-bit checkpoint, the FIRST `model.prediction(from:using:)` call
89
- // crashes hard inside CoreML's C++ IR layer with:
90
- //
91
- // Error: Cannot retrieve vector from IRValue format int32
92
- //
93
- // The crash is reproducible on BOTH iOS Simulator and macOS-native,
94
- // which rules out the previously-suspected simulator-only Espresso
95
- // limitation. Verified that:
96
- // - Model loads fine (no "Failed to build execution plan").
97
- // - input_ids name + shape match the model description.
98
- // - causal_mask name + shape match Apple's published convention.
99
- //
100
- // The error manifests as a process crash (xctest exits unexpectedly,
101
- // not a Swift Error throw), so callers can't try/catch it. The
102
- // RealModelIntegrationTest gates the test off until the cause is
103
- // understood. Live debugging on a real iOS device with Instruments
104
- // is the next step. See:
105
- // packages/dvai-bridge-ios/ios/Tests/DVAIBridgeTests/RealModelIntegrationTest.swift
106
- let inputArr = try MLMultiArray(shape: [1, 1], dataType: .int32)
107
- inputArr.dataPointer.bindMemory(to: Int32.self, capacity: 1).pointee = Int32(token)
108
- features[inputName] = MLFeatureValue(multiArray: inputArr)
109
-
110
- // causal_mask: [1, 1, 1, kvCachePosition+1] Float16, all zeros.
111
- //
112
- // For autoregressive single-token decoding the new query attends to
113
- // every K/V position seen so far (0..kvCachePosition inclusive), so
114
- // the mask is all-zeros (zero = unmasked, large-negative = masked).
115
- // Apple's stateful Llama-3.2 checkpoints declare this input as
116
- // Float16 with shape flexibility `[1, 1, 1...2048, 1...2048]`; we
117
- // produce the minimal slice for the current step.
118
- if !causalMaskName.isEmpty,
119
- model.modelDescription.inputDescriptionsByName[causalMaskName] != nil
120
- {
121
- let kvLen = max(1, kvCachePosition + 1)
122
- let mask = try MLMultiArray(shape: [1, 1, 1, NSNumber(value: kvLen)], dataType: .float16)
123
- // Float16 zero == bit pattern 0x0000, so memset(0) suffices.
124
- memset(mask.dataPointer, 0, mask.count * MemoryLayout<UInt16>.size)
125
- features[causalMaskName] = MLFeatureValue(multiArray: mask)
126
- }
127
-
128
- let input = try MLDictionaryFeatureProvider(dictionary: features)
129
- // `prediction(from:using:options:)` is synchronous in Apple's CoreML iOS 18 API.
130
- // Wrapped in CoreMLGenerator via async Task to avoid blocking the caller's thread.
131
- let output = try model.prediction(from: input, using: state, options: MLPredictionOptions())
132
- guard let logits = output.featureValue(for: outputName)?.multiArrayValue else {
133
- throw CoreMLBackendError.generationFailed(reason: "no '\(outputName)' output in model prediction")
134
- }
135
- return logits
136
- }
137
- }
1
+ import Foundation
2
+ import CoreML
3
+
4
+ /// Wraps an `MLModel` plus the shape conventions our CoreML LLM checkpoints
5
+ /// follow. `makeConversationState()` produces a fresh `MLState` for each
6
+ /// conversation so token-by-token decoding can preserve KV-cache across calls.
7
+ ///
8
+ /// iOS 18 / macOS 15 API notes:
9
+ /// - `MLModel.makeState()` returns `MLState` (non-optional, throws is not in
10
+ /// the signature — it can still crash at runtime on non-stateful models).
11
+ /// - `MLModel.prediction(from:using:options:)` takes state via the `using:`
12
+ /// label, NOT `state:`. Verified against Apple's CoreML docs.
13
+ @available(iOS 18.0, macOS 15.0, *)
14
+ internal final class CoreMLEngine: @unchecked Sendable {
15
+ let model: MLModel
16
+ /// Name of the token-id input feature. Apple-converted Llama-3.2 stateful
17
+ /// checkpoints use `input_ids` (snake_case, matching HF / PyTorch
18
+ /// convention). Override via `opts["coremlInputName"]` for non-standard
19
+ /// checkpoints.
20
+ let inputName: String
21
+ /// Name of the causal-mask input feature. Apple-converted stateful
22
+ /// checkpoints declare a `causal_mask` Float16 multiarray of shape
23
+ /// `[1, 1, q_len, kv_len]` — the model uses it inside
24
+ /// `Ios18.scaledDotProductAttention`. Empty string disables the
25
+ /// causal-mask input (for older or simpler checkpoints that don't
26
+ /// declare it). Override via `opts["coremlCausalMaskName"]`.
27
+ let causalMaskName: String
28
+ let outputName: String // default: "logits"
29
+ let maxContextTokens: Int // from opts; default 2048
30
+ let eosTokenId: Int // from tokenizer or opts
31
+
32
+ init(
33
+ modelURL: URL,
34
+ inputName: String = "input_ids",
35
+ causalMaskName: String = "causal_mask",
36
+ outputName: String = "logits",
37
+ maxContextTokens: Int = 2048,
38
+ eosTokenId: Int,
39
+ computeUnits: MLComputeUnits = .all
40
+ ) throws {
41
+ let cfg = MLModelConfiguration()
42
+ cfg.computeUnits = computeUnits
43
+ do {
44
+ self.model = try MLModel(contentsOf: modelURL, configuration: cfg)
45
+ } catch {
46
+ throw CoreMLBackendError.modelLoadFailed(reason: "\(error)")
47
+ }
48
+ self.inputName = inputName
49
+ self.causalMaskName = causalMaskName
50
+ self.outputName = outputName
51
+ self.maxContextTokens = maxContextTokens
52
+ self.eosTokenId = eosTokenId
53
+ }
54
+
55
+ /// Make a fresh KV-cache state for a new conversation.
56
+ /// Wraps `MLModel.makeState()` (iOS 18 / macOS 15).
57
+ /// Note: `makeState()` is NOT throwing in Apple's API; it returns `MLState`
58
+ /// directly. Non-stateful models will produce a state object that has no
59
+ /// effect — they won't crash here, but predictions will behave as if
60
+ /// stateless. Real validation happens at prediction time.
61
+ func makeConversationState() -> MLState {
62
+ // matches MLModel.makeState() iOS 18 non-throwing signature
63
+ return model.makeState()
64
+ }
65
+
66
+ /// Run a single-token forward pass using the given KV-cache state.
67
+ ///
68
+ /// Uses `MLModel.prediction(from:using:options:)` — the `using:` label
69
+ /// carries the `MLState` object (not `state:`). Verified against Apple docs.
70
+ ///
71
+ /// - Parameters:
72
+ /// - token: New token id to feed (the K/V is appended to `state` by
73
+ /// the model's `Ios18.writeState` op as a side-effect).
74
+ /// - kvCachePosition: 0-based position of the new token in the
75
+ /// conversation. The first prompt token is position 0, second is 1,
76
+ /// etc. Used to size the causal-mask input. Caller increments this
77
+ /// across runStep calls within the same conversation.
78
+ /// - state: KV-cache `MLState` from `makeConversationState()`.
79
+ func runStep(token: Int, kvCachePosition: Int, state: MLState) throws -> MLMultiArray {
80
+ var features: [String: MLFeatureValue] = [:]
81
+
82
+ // input_ids: [1, 1] Int32 with the new token. Direct memory write
83
+ // (rather than NSNumber subscript) matches Apple's documented
84
+ // pattern for primitive multiarray data and avoids unnecessary
85
+ // bridging overhead.
86
+ //
87
+ // KNOWN ISSUE: on the reference Apple-converted Llama-3.2 stateful
88
+ // 4-bit checkpoint, the FIRST `model.prediction(from:using:)` call
89
+ // crashes hard inside CoreML's C++ IR layer with:
90
+ //
91
+ // Error: Cannot retrieve vector from IRValue format int32
92
+ //
93
+ // The crash is reproducible on BOTH iOS Simulator and macOS-native,
94
+ // which rules out the previously-suspected simulator-only Espresso
95
+ // limitation. Verified that:
96
+ // - Model loads fine (no "Failed to build execution plan").
97
+ // - input_ids name + shape match the model description.
98
+ // - causal_mask name + shape match Apple's published convention.
99
+ //
100
+ // The error manifests as a process crash (xctest exits unexpectedly,
101
+ // not a Swift Error throw), so callers can't try/catch it. The
102
+ // RealModelIntegrationTest gates the test off until the cause is
103
+ // understood. Live debugging on a real iOS device with Instruments
104
+ // is the next step. See:
105
+ // packages/dvai-bridge-ios/ios/Tests/DVAIBridgeTests/RealModelIntegrationTest.swift
106
+ let inputArr = try MLMultiArray(shape: [1, 1], dataType: .int32)
107
+ inputArr.dataPointer.bindMemory(to: Int32.self, capacity: 1).pointee = Int32(token)
108
+ features[inputName] = MLFeatureValue(multiArray: inputArr)
109
+
110
+ // causal_mask: [1, 1, 1, kvCachePosition+1] Float16, all zeros.
111
+ //
112
+ // For autoregressive single-token decoding the new query attends to
113
+ // every K/V position seen so far (0..kvCachePosition inclusive), so
114
+ // the mask is all-zeros (zero = unmasked, large-negative = masked).
115
+ // Apple's stateful Llama-3.2 checkpoints declare this input as
116
+ // Float16 with shape flexibility `[1, 1, 1...2048, 1...2048]`; we
117
+ // produce the minimal slice for the current step.
118
+ if !causalMaskName.isEmpty,
119
+ model.modelDescription.inputDescriptionsByName[causalMaskName] != nil
120
+ {
121
+ let kvLen = max(1, kvCachePosition + 1)
122
+ let mask = try MLMultiArray(shape: [1, 1, 1, NSNumber(value: kvLen)], dataType: .float16)
123
+ // Float16 zero == bit pattern 0x0000, so memset(0) suffices.
124
+ memset(mask.dataPointer, 0, mask.count * MemoryLayout<UInt16>.size)
125
+ features[causalMaskName] = MLFeatureValue(multiArray: mask)
126
+ }
127
+
128
+ let input = try MLDictionaryFeatureProvider(dictionary: features)
129
+ // `prediction(from:using:options:)` is synchronous in Apple's CoreML iOS 18 API.
130
+ // Wrapped in CoreMLGenerator via async Task to avoid blocking the caller's thread.
131
+ let output = try model.prediction(from: input, using: state, options: MLPredictionOptions())
132
+ guard let logits = output.featureValue(for: outputName)?.multiArrayValue else {
133
+ throw CoreMLBackendError.generationFailed(reason: "no '\(outputName)' output in model prediction")
134
+ }
135
+ return logits
136
+ }
137
+ }