@dvai-bridge/ios-mlx-core 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Deep Voice Ai Limited - Software License Agreement
|
|
2
|
+
|
|
3
|
+
**Version 1.0.0**
|
|
4
|
+
|
|
5
|
+
This License Agreement governs the use of the DVAI-Bridge software (the "Software"). By downloading, installing, or using the Software, you agree to be bound by the terms of this License.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 1. LICENSE GRANTS
|
|
10
|
+
|
|
11
|
+
### 1.1 Development and Personal Use (Free Tier)
|
|
12
|
+
Deep Voice Ai Limited ("Licensor") grants you a non-exclusive, non-transferable, royalty-free license to use the Software solely for:
|
|
13
|
+
- Internal development and testing purposes.
|
|
14
|
+
- Non-commercial personal projects.
|
|
15
|
+
- Academic and non-profit research.
|
|
16
|
+
|
|
17
|
+
### 1.2 Commercial Use (Paid Tier)
|
|
18
|
+
Any use of the Software for **Commercial Purposes** requires a separate, paid Commercial License from Licensor. "Commercial Purposes" include:
|
|
19
|
+
- Use in production environments.
|
|
20
|
+
- Integration into revenue-generating products or services.
|
|
21
|
+
- Distribution to third-party customers for a fee.
|
|
22
|
+
- Use by an entity with more than $100,000 USD in annual revenue.
|
|
23
|
+
|
|
24
|
+
To obtain a Commercial License, contact `info@deepvoiceai.co` or visit `https://deepvoiceai.co/licensing`.
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## 2. RESTRICTIONS
|
|
29
|
+
Except as expressly permitted, you may not:
|
|
30
|
+
- Sublicense, rent, lease, or resell the Software without express permission.
|
|
31
|
+
- Remove any proprietary notices or branding from the Software.
|
|
32
|
+
- Use the Software for any illegal or malicious purposes.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 3. INTELLECTUAL PROPERTY
|
|
37
|
+
The Software is owned by **Deep Voice Ai Limited** and is protected by copyright and intellectual property laws. This agreement does not transfer ownership of the Software.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## 4. NO WARRANTY
|
|
42
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. IN NO EVENT SHALL THE LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## 5. GOVERNING LAW
|
|
47
|
+
This License shall be governed by and construed in accordance with the laws of the jurisdiction where Deep Voice Ai Limited is registered.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
© 2026 Deep Voice Ai Limited. All rights reserved.
|
package/Package.swift
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
// swift-tools-version: 5.9
|
|
2
|
+
import PackageDescription
|
|
3
|
+
|
|
4
|
+
// MLX runs on Apple Silicon (GPU via Metal, with a CPU fallback) through Apple's MLX Swift framework.
|
|
5
|
+
// Platform floor: iOS 17 / macOS 14 (mlx-swift-lm's own minimum).
|
|
6
|
+
// Runtime requirement: Apple Silicon (no Intel Mac, no iOS Simulator on
|
|
7
|
+
// Intel hosts). The library compiles and links on Intel sims but `MLX.GPU`
|
|
8
|
+
// always reports unavailable, so any `start()` call returns a clear error.
|
|
9
|
+
// Name shared by the library product and its source target.
let coreName = "DVAIMLXCore"

// mlx-swift-lm provides the MLXLLM (LLM inference) and MLXLMCommon
// (ChatSession, ModelContainer) products and brings mlx-swift and
// swift-transformers along transitively. The requirement below admits
// 2.31.3 up to, but not including, 3.0.0: the 2.x line still offers the
// HuggingFace Hub-backed `loadModelContainer(id:)` convenience API, which
// is the simplest route to a working model load. The 3.x line requires an
// explicit Downloader + TokenizerLoader, which would mean building an HF
// download/auth story next to this scaffold; that is deferred to Phase 3D.
let mlxSwiftLM: Package.Dependency = .package(
    url: "https://github.com/ml-explore/mlx-swift-lm.git",
    "2.31.3" ..< "3.0.0"
)

// Shared HTTP-server / handler-dispatch types. These used to come from
// DVAILlamaCore, which transitively pulled llama.xcframework into MLX-only
// builds; the shared-core extraction removes that coupling so MLX
// consumers don't carry a binary they never use. DVAISharedCore brings in
// Hummingbird transitively as of v3.2.0 — Telegraph is no longer the iOS
// HTTP server backbone.
let sharedCore: Package.Dependency = .package(path: "../dvai-bridge-ios-shared-core")

let package = Package(
    name: coreName,
    platforms: [.iOS(.v17), .macOS(.v14)],
    products: [
        .library(name: coreName, targets: [coreName]),
    ],
    dependencies: [
        mlxSwiftLM,
        sharedCore,
    ],
    targets: [
        .target(
            name: coreName,
            dependencies: [
                .product(name: "MLXLLM", package: "mlx-swift-lm"),
                .product(name: "MLXLMCommon", package: "mlx-swift-lm"),
                .product(name: "DVAISharedCore", package: "dvai-bridge-ios-shared-core"),
            ],
            path: "ios/Sources/DVAIMLXCore"
        ),
        .testTarget(
            name: "DVAIMLXCoreTests",
            dependencies: [.target(name: coreName)],
            path: "ios/Tests/DVAIMLXCoreTests"
        ),
    ]
)
|
package/README.md
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+

|
|
2
|
+
|
|
3
|
+
# DVAI-Bridge
|
|
4
|
+
|
|
5
|
+
<!-- [](https://github.com/Westenets/dvai-bridge/actions/workflows/smoke-real-models.yml) -->
|
|
6
|
+
|
|
7
|
+
[](LICENSE)      
|
|
8
|
+
|
|
9
|
+
> **The local OpenAI server you embed inside your app.**
|
|
10
|
+
> One library. One HTTP wire. Every platform. Zero install for your users.
|
|
11
|
+
|
|
12
|
+
**Docs:** [dvai-bridge.deepvoiceai.co](https://dvai-bridge.deepvoiceai.co)
|
|
13
|
+
|
|
14
|
+
```ts
|
|
15
|
+
import { DVAI } from "@dvai-bridge/core";
|
|
16
|
+
import OpenAI from "openai";
|
|
17
|
+
|
|
18
|
+
const dvai = new DVAI({ backend: "transformers" });
|
|
19
|
+
await dvai.initialize();
|
|
20
|
+
|
|
21
|
+
const openai = new OpenAI({ baseURL: dvai.baseUrl, apiKey: "ignored" });
|
|
22
|
+
await openai.chat.completions.create({
|
|
23
|
+
model: dvai.transformersModelId,
|
|
24
|
+
messages: [{ role: "user", content: "Hello!" }],
|
|
25
|
+
});
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
That's it. A real OpenAI-compatible server is now running inside your app's
|
|
29
|
+
own process. Point any OpenAI client — LangChain, the OpenAI SDK, the Vercel
|
|
30
|
+
AI SDK, anything — at `dvai.baseUrl` and your agent code keeps working.
|
|
31
|
+
|
|
32
|
+
Built by **[Deep Voice AI](https://deepvoiceai.co)**.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Why it exists
|
|
37
|
+
|
|
38
|
+
Local AI works beautifully on a laptop with **Ollama + LangChain**. Then you
|
|
39
|
+
try to ship the app and your users don't have Ollama. Mobile can't run it.
|
|
40
|
+
Corporate IT won't add another daemon. So you reinvent the same plumbing —
|
|
41
|
+
spawn an inference engine, bind a port, translate to OpenAI HTTP, handle
|
|
42
|
+
CORS, manage lifecycle, wrap the accelerator of the day per platform — and
|
|
43
|
+
do it all over again for every target OS.
|
|
44
|
+
|
|
45
|
+
DVAI-Bridge is that plumbing, packaged as a library, for every client
|
|
46
|
+
platform.
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## What you get
|
|
51
|
+
|
|
52
|
+
- **One OpenAI HTTP surface.** Bound on `127.0.0.1` (or `0.0.0.0` for
|
|
53
|
+
device-to-device). Streaming, embeddings, models, recovery — all built in.
|
|
54
|
+
- **Six SDKs.** `@dvai-bridge/core` + `react` + `vanilla` + `capacitor`,
|
|
55
|
+
`DVAIBridge` (Swift / iOS), `co.deepvoiceai:dvai-bridge` (Kotlin / Android),
|
|
56
|
+
`@dvai-bridge/react-native`, `dvai_bridge` (Flutter), `co.deepvoiceai.dvai-bridge` (.NET).
|
|
57
|
+
- **Nine backends.** WebLLM, Transformers.js, llama.cpp, Apple Foundation
|
|
58
|
+
Models, MLX, CoreML / ANE, MediaPipe LLM, LiteRT, ONNX Runtime GenAI —
|
|
59
|
+
selected per-platform, invisible to your agent code.
|
|
60
|
+
- **Native acceleration** wherever it runs: WebGPU in browsers, CUDA / Metal
|
|
61
|
+
/ Vulkan / DirectML on desktop, ANE / Metal / MLX on iOS, NNAPI / QNN
|
|
62
|
+
Hexagon / GPU delegate on Android.
|
|
63
|
+
- **Multimodal.** Text, image, audio, video — declarative loader for
|
|
64
|
+
cutting-edge models (Gemma 4, LLaVA, Idefics) without waiting for library
|
|
65
|
+
updates.
|
|
66
|
+
- **Distributed inference (v3.0+).** Phone too slow? Offload to your laptop
|
|
67
|
+
on the same Wi-Fi via mDNS pairing — same OpenAI wire, transparent to
|
|
68
|
+
your code. Internet path via a self-hostable rendezvous server.
|
|
69
|
+
- **DVAI Hub (v3.1+).** A first-party desktop utility that turns any device
|
|
70
|
+
into a strong-peer for the rest of your fleet. Brand-neutral install via
|
|
71
|
+
Homebrew / winget / GitHub Releases, OR fork it for your own branded
|
|
72
|
+
companion. Routes through Ollama / LM Studio / vLLM / llama-server /
|
|
73
|
+
llamafile if you've already got those running.
|
|
74
|
+
- **Zero user install.** It's a library, not a daemon. `npm install`,
|
|
75
|
+
`cocoapods`, gradle — your CI already has the muscle for it.
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Supported platforms
|
|
80
|
+
|
|
81
|
+
| Stack | Package | Backends |
|
|
82
|
+
| --- | --- | --- |
|
|
83
|
+
| Browser (React, Vue, Svelte, vanilla JS) | `@dvai-bridge/core` + `react` / `vanilla` | WebLLM (WebGPU), Transformers.js (WebGPU / WASM SIMD) |
|
|
84
|
+
| Node / Bun / Electron | `@dvai-bridge/core` | Transformers.js, native llama.cpp |
|
|
85
|
+
| Capacitor hybrid mobile | `@dvai-bridge/capacitor` + backend slice | Native llama.cpp (Metal iOS, Vulkan / CPU Android) |
|
|
86
|
+
| iOS native (Swift) | `DVAIBridge` (SPM / CocoaPods) | llama.cpp (Metal), CoreML / ANE, Apple Foundation Models, MLX |
|
|
87
|
+
| Android native (Kotlin / Java) | `co.deepvoiceai:dvai-bridge` (AAR) | llama.cpp, MediaPipe LLM, LiteRT, NNAPI / QNN |
|
|
88
|
+
| React Native (≥0.77, TurboModule) | `@dvai-bridge/react-native` | All iOS + Android backends (delegates) |
|
|
89
|
+
| Flutter (≥3.39) | `dvai_bridge` (pub.dev) | All iOS + Android backends (Pigeon channels) |
|
|
90
|
+
| .NET 10 LTS (MAUI / Avalonia / WinUI / Catalyst / desktop) | `co.deepvoiceai.dvai-bridge*` (NuGet) | iOS / Android delegate to native; desktop = llama.cpp + ONNX Runtime GenAI + ML.NET |
|
|
91
|
+
|
|
92
|
+
Full quickstart per platform: [dvai-bridge.deepvoiceai.co/guide/getting-started](https://dvai-bridge.deepvoiceai.co/guide/getting-started)
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Examples
|
|
97
|
+
|
|
98
|
+
```tsx
|
|
99
|
+
// React
|
|
100
|
+
import { DVAIProvider, useDVAI } from "@dvai-bridge/react";
|
|
101
|
+
<DVAIProvider config={{ backend: "transformers" }}>
|
|
102
|
+
<Chat />
|
|
103
|
+
</DVAIProvider>;
|
|
104
|
+
function Chat() {
|
|
105
|
+
const { isReady, baseUrl } = useDVAI();
|
|
106
|
+
return isReady ? <div>Local AI live at {baseUrl}</div> : <Loading />;
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
```swift
|
|
111
|
+
// iOS
|
|
112
|
+
let server = try await DVAIBridge.shared.start()
|
|
113
|
+
// server.baseUrl = "http://127.0.0.1:38883/v1"
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
```kotlin
|
|
117
|
+
// Android
|
|
118
|
+
val server = DVAIBridge.start(context)
|
|
119
|
+
// server.baseUrl = "http://127.0.0.1:38883/v1"
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
```dart
|
|
123
|
+
// Flutter
|
|
124
|
+
final state = await DVAIBridge.instance.start(
|
|
125
|
+
backend: BackendKind.auto,
|
|
126
|
+
modelPath: '/path/to/model.gguf',
|
|
127
|
+
);
|
|
128
|
+
// state.baseUrl = "http://127.0.0.1:38883/v1"
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
```csharp
|
|
132
|
+
// .NET
|
|
133
|
+
var server = await DVAIBridge.Shared.StartAsync(new StartOptions {
|
|
134
|
+
Backend = BackendKind.Auto,
|
|
135
|
+
ModelPath = "/path/to/model.gguf",
|
|
136
|
+
});
|
|
137
|
+
// server.BaseUrl = "http://127.0.0.1:38883/v1"
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Multimodal, streaming, embeddings, distributed offload, the Hub —
|
|
141
|
+
everything's at the [docs site](https://dvai-bridge.deepvoiceai.co).
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## What's new in v3.1
|
|
146
|
+
|
|
147
|
+
- **DVAI Hub** — Tauri desktop utility that's the strong-peer side of v3
|
|
148
|
+
distributed inference. `brew install deepvoiceai/dvai-hub/dvai-hub` (or
|
|
149
|
+
`winget install DeepVoiceAI.DVAIHub`) → mobile apps on the same Wi-Fi
|
|
150
|
+
pair with it and offload heavy inference. [Guide →](https://dvai-bridge.deepvoiceai.co/guide/dvai-hub)
|
|
151
|
+
- **External-engine bridge.** Hub surfaces Ollama / LM Studio / vLLM /
|
|
152
|
+
llama-server / llamafile as additional backend pools so paired apps
|
|
153
|
+
serve from whatever's already cached. Opt-in per engine.
|
|
154
|
+
- **Strict substitution policy.** Models with mismatched family / version /
|
|
155
|
+
size / type are refused by default; quant-only mismatches gated behind a
|
|
156
|
+
per-pairing `preferBetterQuant` flag. No silent mis-routing.
|
|
157
|
+
- **HMAC-signed identity** on `/v1/chat/completions`. Per-app audit logs
|
|
158
|
+
surface who served what, with structured `(appId, peerDeviceId,
|
|
159
|
+
engine, requestedModel, servedModel, outcome)` rows.
|
|
160
|
+
- **Library finalization.** `httpBindHost` (LAN bind), `chatCompletionInterceptor`
|
|
161
|
+
(extension point), HMAC primitives re-exported, `/v1/dvai/*` routes
|
|
162
|
+
actually dispatched, TransformersBackend Node-mode device fix.
|
|
163
|
+
[Migration v3.0 → v3.1 →](https://dvai-bridge.deepvoiceai.co/migration/v3.0-to-v3.1)
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Robustness
|
|
168
|
+
|
|
169
|
+
Streaming-correct (SSE passthrough + blank-chunk detection), generation
|
|
170
|
+
timeout, automatic engine-state recovery on fatal errors, port fallback,
|
|
171
|
+
worker offloading, Private Network Access ready, CORS configured. The
|
|
172
|
+
boring substrate so your agent code never has to think about it.
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## Licensing
|
|
177
|
+
|
|
178
|
+
Dual: **free for development & personal use** on `localhost` (verified at
|
|
179
|
+
runtime). **Commercial use** requires a license key — `info@deepvoiceai.co`.
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Contributing
|
|
184
|
+
|
|
185
|
+
PRs welcome.
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
pnpm install
|
|
189
|
+
pnpm build
|
|
190
|
+
bash scripts/build-all.sh # full matrix (auto-skips per-host)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
See [`CONTRIBUTING.md`](./CONTRIBUTING.md) for the PR flow. Per-platform
|
|
194
|
+
contributor docs (iOS / Android / RN / Flutter / .NET) under
|
|
195
|
+
[`docs/development/`](./docs/development/).
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
© Deep Voice AI Limited. All rights reserved.
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
// MLXHandlers — DVAIHandlers conformance for the MLX backend.
|
|
2
|
+
//
|
|
3
|
+
// Wraps mlx-swift-lm's MLXLMCommon.ChatSession into our OpenAI-compatible
|
|
4
|
+
// HTTP surface. The actual model load happens at MLXPluginState.start()
|
|
5
|
+
// time; by the time these methods are called, `modelContainer` is ready.
|
|
6
|
+
|
|
7
|
+
import Foundation
|
|
8
|
+
#if !COCOAPODS
|
|
9
|
+
import DVAISharedCore
|
|
10
|
+
#endif
|
|
11
|
+
import MLXLMCommon
|
|
12
|
+
|
|
13
|
+
/// `DVAIHandlers` conformance for the MLX backend.
///
/// Translates OpenAI-style request bodies into prompts for mlx-swift-lm's
/// `ChatSession` and shapes the replies as OpenAI-compatible JSON / SSE.
///
/// A `ChatSession` keeps its own conversational state and (per the upstream
/// docstring) is not thread-safe, while this HTTP surface is stateless: every
/// request already carries the full message history. Each request therefore
/// gets a fresh `ChatSession` built on the shared `ModelContainer`, so history
/// from earlier requests is never replayed into later ones and no session
/// object is shared between concurrent requests.
public final class MLXHandlers: DVAIHandlers, @unchecked Sendable {
    private let modelId: String
    private let modelContainer: ModelContainer

    /// - Parameters:
    ///   - modelId: Identifier echoed back in the `model` field of responses.
    ///   - modelContainer: Already-loaded model used to back every session.
    public init(modelId: String, modelContainer: ModelContainer) {
        self.modelId = modelId
        self.modelContainer = modelContainer
    }

    /// Handles a chat-completion request.
    ///
    /// Reads `messages` (defaults to empty) and `stream` (defaults to `false`)
    /// from `body`. Returns `.sse` with OpenAI-style chunks when streaming,
    /// otherwise a `.json(200, …)` `chat.completion` object.
    ///
    /// - Throws: Whatever `ChatSession.respond(to:)` throws (non-streaming path).
    public func handleChatCompletion(body: [String: Any], ctx: HandlerContext) async throws -> HandlerResponse {
        let messages = (body["messages"] as? [[String: Any]]) ?? []
        let prompt = Self.flattenMessagesToPrompt(messages)
        let wantsStream = (body["stream"] as? Bool) ?? false

        if wantsStream {
            return .sse(makeChatCompletionStream(prompt: prompt))
        }

        let reply = try await makeSession().respond(to: prompt)
        let choice: [String: Any] = [
            "index": 0,
            "message": ["role": "assistant", "content": reply],
            "finish_reason": "stop",
        ]
        let json: [String: Any] = [
            "id": Self.makeCompletionId(),
            "object": "chat.completion",
            "created": Int(Date().timeIntervalSince1970),
            "model": modelId,
            "choices": [choice],
        ]
        return .json(200, json)
    }

    /// Handles a legacy text-completion request.
    ///
    /// Reads `prompt` from `body` (defaults to the empty string) and returns a
    /// `.json(200, …)` `text_completion` object.
    ///
    /// - Throws: Whatever `ChatSession.respond(to:)` throws.
    public func handleCompletion(body: [String: Any], ctx: HandlerContext) async throws -> HandlerResponse {
        let prompt = (body["prompt"] as? String) ?? ""
        let reply = try await makeSession().respond(to: prompt)
        let choice: [String: Any] = [
            "text": reply,
            "index": 0,
            "finish_reason": "stop",
        ]
        let json: [String: Any] = [
            "id": Self.makeCompletionId(),
            "object": "text_completion",
            "created": Int(Date().timeIntervalSince1970),
            "model": modelId,
            "choices": [choice],
        ]
        return .json(200, json)
    }

    /// Embeddings are not served by this backend; always answers 501.
    public func handleEmbeddings(body: [String: Any], ctx: HandlerContext) async throws -> HandlerResponse {
        // Embeddings would need MLXEmbedders + a different model. Defer to
        // the .llama or .coreml backends for now.
        return .error(501, "MLX backend does not currently expose embeddings; use BackendKind.llama or .coreml for /v1/embeddings.")
    }

    /// Lists the single model this handler was created with.
    public func handleModels(ctx: HandlerContext) async throws -> HandlerResponse {
        let entry: [String: Any] = [
            "id": modelId,
            "object": "model",
            "created": Int(Date().timeIntervalSince1970),
            "owned_by": "mlx",
        ]
        let json: [String: Any] = [
            "object": "list",
            "data": [entry],
        ]
        return .json(200, json)
    }

    // MARK: - Private helpers

    /// Builds a fresh session so no conversational state leaks between requests.
    private func makeSession() -> ChatSession {
        ChatSession(modelContainer)
    }

    /// Short response identifier in the form `mlx-XXXXXXXX`.
    private static func makeCompletionId() -> String {
        "mlx-\(UUID().uuidString.prefix(8))"
    }

    /// Serialises `payload` as one SSE `data:` frame, or `nil` if it cannot be encoded.
    private static func sseFrame(_ payload: [String: Any]) -> String? {
        guard let data = try? JSONSerialization.data(withJSONObject: payload),
              let text = String(data: data, encoding: .utf8) else {
            return nil
        }
        return "data: \(text)\n\n"
    }

    /// Builds one `chat.completion.chunk` payload.
    private static func streamChunk(
        id: String,
        created: Int,
        model: String,
        delta: [String: Any],
        finishReason: String?
    ) -> [String: Any] {
        let reason: Any
        if let finishReason {
            reason = finishReason
        } else {
            reason = NSNull()
        }
        let choice: [String: Any] = [
            "index": 0,
            "delta": delta,
            "finish_reason": reason,
        ]
        return [
            "id": id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model,
            "choices": [choice],
        ]
    }

    /// Streams the model's partial responses as OpenAI-style SSE frames.
    ///
    /// All chunks of one response share the same `id` and `created` value.
    /// After the last content chunk a terminal chunk with
    /// `finish_reason: "stop"` is sent, followed by `data: [DONE]`. If
    /// generation fails, an `error` frame is emitted instead so the client can
    /// tell a failure from a normal end of stream. Generation is cancelled
    /// when the consumer stops reading the stream.
    private func makeChatCompletionStream(prompt: String) -> AsyncStream<String> {
        let container = modelContainer
        let model = modelId
        let completionId = Self.makeCompletionId()
        let created = Int(Date().timeIntervalSince1970)

        return AsyncStream<String> { continuation in
            let generation = Task {
                do {
                    let session = ChatSession(container)
                    for try await partial in session.streamResponse(to: prompt) {
                        let chunk = MLXHandlers.streamChunk(
                            id: completionId,
                            created: created,
                            model: model,
                            delta: ["role": "assistant", "content": partial],
                            finishReason: nil
                        )
                        if let frame = MLXHandlers.sseFrame(chunk) {
                            continuation.yield(frame)
                        }
                    }
                    let terminal = MLXHandlers.streamChunk(
                        id: completionId,
                        created: created,
                        model: model,
                        delta: [String: Any](),
                        finishReason: "stop"
                    )
                    if let frame = MLXHandlers.sseFrame(terminal) {
                        continuation.yield(frame)
                    }
                    continuation.yield("data: [DONE]\n\n")
                } catch is CancellationError {
                    // The consumer went away; there is nobody left to notify.
                } catch {
                    let failure: [String: Any] = [
                        "error": [
                            "message": error.localizedDescription,
                            "type": "server_error",
                        ],
                    ]
                    if let frame = MLXHandlers.sseFrame(failure) {
                        continuation.yield(frame)
                    }
                }
                continuation.finish()
            }
            // Stop generating as soon as the stream is torn down (e.g. the
            // HTTP client disconnected) instead of running to completion.
            continuation.onTermination = { _ in generation.cancel() }
        }
    }

    /// Flattens OpenAI-style chat messages into a single prompt string.
    ///
    /// Each message becomes one `[role]: content` line; a missing role
    /// defaults to `"user"`. Lines are joined with newlines.
    private static func flattenMessagesToPrompt(_ messages: [[String: Any]]) -> String {
        messages
            .map { message in
                let role = (message["role"] as? String) ?? "user"
                return "[\(role)]: \(textContent(of: message["content"]))"
            }
            .joined(separator: "\n")
    }

    /// Extracts the text of a message's `content` field.
    ///
    /// Accepts either a plain string or an array of content parts, in which
    /// case the `text` of every part whose `type` is `"text"` is joined with
    /// newlines. Anything else yields the empty string.
    private static func textContent(of raw: Any?) -> String {
        if let text = raw as? String {
            return text
        }
        guard let parts = raw as? [[String: Any]] else {
            return ""
        }
        return parts
            .compactMap { part -> String? in
                guard (part["type"] as? String) == "text" else { return nil }
                return part["text"] as? String
            }
            .joined(separator: "\n")
    }
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
// MLXPluginState — lifecycle owner for the MLX backend.
|
|
2
|
+
//
|
|
3
|
+
// Mirrors FoundationPluginState's shape so DVAIBridge can swap among
|
|
4
|
+
// .llama / .foundation / .coreml / .mlx backends with the same
|
|
5
|
+
// start/stop/status surface.
|
|
6
|
+
//
|
|
7
|
+
// Notes:
|
|
8
|
+
// - MLX requires Apple Silicon at runtime; we don't gate that here
|
|
9
|
+
// because the underlying mlx-swift framework returns a clean error if
|
|
10
|
+
// the GPU is unavailable.
|
|
11
|
+
// - `modelPath` opt is a HuggingFace model id (e.g.
|
|
12
|
+
// "mlx-community/Llama-3.2-1B-Instruct-4bit"). The first call
|
|
13
|
+
// downloads weights into the user's HF cache; subsequent calls
|
|
14
|
+
// reuse them. Local-directory loads are a Phase 3D follow-up
|
|
15
|
+
// (the mlx-swift-lm 2.x convenience API takes only an HF id).
|
|
16
|
+
|
|
17
|
+
import Foundation
|
|
18
|
+
#if !COCOAPODS
|
|
19
|
+
import DVAISharedCore
|
|
20
|
+
#endif
|
|
21
|
+
import MLXLMCommon
|
|
22
|
+
// MLXLLM registers `LLMModelFactory` with the global ModelFactoryRegistry
|
|
23
|
+
// at load time. We never reference its types directly (loadModelContainer
|
|
24
|
+
// finds the factory via NSClassFromString → TrampolineModelFactory), but
|
|
25
|
+
// the import is required so the linker keeps its objects in the binary.
|
|
26
|
+
@_implementationOnly import MLXLLM
|
|
27
|
+
|
|
28
|
+
/// Lifecycle owner for the MLX backend: loads the model, starts the local
/// HTTP server, and reports status.
public actor MLXPluginState {
    private var server: HttpServer?
    private var handlers: MLXHandlers?
    private(set) var modelId: String = ""
    private(set) var isRunning: Bool = false
    private(set) var baseUrl: String?
    private(set) var port: Int?

    public init() {}

    /// Loads the requested model and binds the HTTP server on `127.0.0.1`.
    ///
    /// If the backend is already running it is stopped first.
    ///
    /// Recognised `opts` keys:
    /// - `modelPath` (`String`, required): HuggingFace model id, e.g.
    ///   "mlx-community/Llama-3.2-1B-Instruct-4bit".
    /// - `httpBasePort` (`Int`, default 38883).
    /// - `httpMaxPortAttempts` (`Int`, default 16).
    /// - `corsOrigin` (`String` or `[String]`, default wildcard).
    ///
    /// - Returns: A dictionary with `baseUrl`, `port`, `backend` and `modelId`.
    /// - Throws: `NSError` in domain `DVAIBridgeMLX` with code 400 when
    ///   `modelPath` is missing or empty, or code 500 when the model fails to
    ///   load (the original error is attached under `NSUnderlyingErrorKey`);
    ///   rethrows any error from binding the server.
    public func start(opts: [String: Any]) async throws -> [String: Any] {
        // NOTE(review): this method suspends for a long time while the model
        // loads, so overlapping `start` calls can interleave on the actor —
        // confirm callers serialise start/stop.
        if isRunning { try await stopInternal() }

        // Required: HF model id, e.g. "mlx-community/Llama-3.2-1B-Instruct-4bit".
        guard let modelPath = opts["modelPath"] as? String, !modelPath.isEmpty else {
            throw NSError(
                domain: "DVAIBridgeMLX",
                code: 400,
                userInfo: [NSLocalizedDescriptionKey: "MLX backend requires a `modelPath` option (HuggingFace model id, e.g. \"mlx-community/Llama-3.2-1B-Instruct-4bit\")."]
            )
        }

        let httpBasePort = opts["httpBasePort"] as? Int ?? 38883
        let httpMaxPortAttempts = opts["httpMaxPortAttempts"] as? Int ?? 16
        let corsConfig = parseCors(opts["corsOrigin"])

        // Load the model from HF (cached on subsequent runs). This is the
        // expensive call — can take 30–120s for first download depending
        // on size and network; instant on cache hits.
        let modelContainer: ModelContainer
        do {
            modelContainer = try await loadModelContainer(id: modelPath)
        } catch {
            throw NSError(
                domain: "DVAIBridgeMLX",
                code: 500,
                userInfo: [
                    NSLocalizedDescriptionKey: "MLX model load failed for \(modelPath): \(error.localizedDescription)",
                    // Keep the original error so callers can inspect the root cause.
                    NSUnderlyingErrorKey: error,
                ]
            )
        }

        // Build handlers first; Hummingbird requires routes to be
        // installed at Application construction time, so installRoutes
        // → tryBind is the mandatory order.
        let handlers = MLXHandlers(modelId: modelPath, modelContainer: modelContainer)
        let ctx = HandlerContext(modelId: modelPath, backendName: "mlx")
        let server = HttpServer()
        await server.installRoutes(handlers: handlers, ctx: ctx, corsConfig: corsConfig)

        // The same host is used for binding and for the advertised base URL.
        let loopbackHost = "127.0.0.1"
        let boundPort = try await server.tryBind(
            basePort: httpBasePort,
            maxAttempts: httpMaxPortAttempts,
            host: loopbackHost
        )
        let boundBaseUrl = "http://\(loopbackHost):\(boundPort)/v1"

        self.handlers = handlers
        self.modelId = modelPath
        self.server = server
        self.port = boundPort
        self.baseUrl = boundBaseUrl
        self.isRunning = true

        return [
            "baseUrl": boundBaseUrl,
            "port": boundPort,
            "backend": "mlx",
            "modelId": modelPath,
        ]
    }

    /// Stops the HTTP server (if any) and resets all state.
    public func stop() async throws {
        try await stopInternal()
    }

    /// Shared teardown used by `stop()` and by `start(opts:)` when restarting.
    private func stopInternal() async throws {
        await server?.stop()
        server = nil
        handlers = nil
        modelId = ""
        baseUrl = nil
        port = nil
        isRunning = false
    }

    /// Snapshot of the current state: always `running`; `baseUrl` when set;
    /// `backend` only while running.
    public func statusInfo() -> [String: Any] {
        var dict: [String: Any] = ["running": isRunning]
        if let baseUrl { dict["baseUrl"] = baseUrl }
        if isRunning { dict["backend"] = "mlx" }
        return dict
    }

    /// Maps the raw `corsOrigin` option to a `CORSConfig`.
    ///
    /// `"*"` and any unrecognised or missing value become `.wildcard`; any
    /// other string becomes `.exact`; a string array becomes `.allowlist`.
    private func parseCors(_ raw: Any?) -> CORSConfig {
        switch raw {
        case let origin as String:
            return origin == "*" ? .wildcard : .exact(origin)
        case let origins as [String]:
            return .allowlist(origins)
        default:
            return .wildcard
        }
    }
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// Smoke test for DVAIMLXCore. Real model load is gated behind an
|
|
2
|
+
// env-var so CI runs (and dev machines without the model cached)
|
|
3
|
+
// don't hang for minutes downloading weights.
|
|
4
|
+
|
|
5
|
+
import XCTest
|
|
6
|
+
@testable import DVAIMLXCore
|
|
7
|
+
|
|
8
|
+
/// Smoke tests for `MLXPluginState`'s start/stop/status surface.
final class MLXPluginStateTest: XCTestCase {
    /// A freshly created state reports `running == false` and exposes
    /// neither `backend` nor `baseUrl`.
    func testStatusInfoBeforeStartReportsNotRunning() async {
        let state = MLXPluginState()
        let info = await state.statusInfo()
        XCTAssertEqual(info["running"] as? Bool, false)
        XCTAssertNil(info["backend"])
        XCTAssertNil(info["baseUrl"])
    }

    /// Starting without a `modelPath` must fail with the 400 error from the
    /// `DVAIBridgeMLX` domain, and the message must name the missing option.
    func testStartWithoutModelPathThrows() async {
        let state = MLXPluginState()
        do {
            _ = try await state.start(opts: [:])
            XCTFail("expected start without modelPath to throw")
        } catch {
            let nsError = error as NSError
            XCTAssertEqual(nsError.domain, "DVAIBridgeMLX")
            XCTAssertEqual(nsError.code, 400)
            XCTAssertTrue("\(error)".contains("modelPath"), "error should mention missing modelPath: \(error)")
        }
    }

    /// End-to-end test against a real MLX model. Skipped unless
    /// `SMOKE_MLX_MODEL_ID` is set (e.g. "mlx-community/Llama-3.2-1B-Instruct-4bit").
    /// The first run downloads weights into the user's HF cache (~700 MB
    /// for the 1B-4bit). Subsequent runs hit the cache.
    func testStartWithRealModel() async throws {
        let env = ProcessInfo.processInfo.environment
        guard let modelId = env["SMOKE_MLX_MODEL_ID"], !modelId.isEmpty else {
            throw XCTSkip("SMOKE_MLX_MODEL_ID not set; skipping MLX real-model test")
        }
        let state = MLXPluginState()
        let result = try await state.start(opts: ["modelPath": modelId])

        // XCTAssert* failures do not throw, so the awaited stop below always
        // runs once start has succeeded.
        XCTAssertEqual(result["backend"] as? String, "mlx")
        XCTAssertEqual(result["modelId"] as? String, modelId)
        XCTAssertNotNil(result["baseUrl"])

        // Await the shutdown instead of detaching it, so the server is
        // stopped before the test returns.
        try await state.stop()
        let info = await state.statusInfo()
        XCTAssertEqual(info["running"] as? Bool, false)
    }
}
|
package/package.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@dvai-bridge/ios-mlx-core",
|
|
3
|
+
"version": "4.0.0",
|
|
4
|
+
"description": "DVAI-Bridge iOS MLX core — Swift wrapper around Apple's MLX Swift LM (mlx-swift-lm) running LLM inference on Apple Silicon. Apple-Silicon-only at runtime; iOS 17+/macOS 14+ link-time minimum.",
|
|
5
|
+
"author": "Deep Chakraborty (https://github.com/dk013)",
|
|
6
|
+
"license": "SEE LICENSE IN LICENSE",
|
|
7
|
+
"main": "Package.swift",
|
|
8
|
+
"files": [
|
|
9
|
+
"Package.swift",
|
|
10
|
+
"ios",
|
|
11
|
+
"README.md",
|
|
12
|
+
"LICENSE"
|
|
13
|
+
],
|
|
14
|
+
"publishConfig": {
|
|
15
|
+
"registry": "https://registry.npmjs.org/",
|
|
16
|
+
"access": "public"
|
|
17
|
+
}
|
|
18
|
+
}
|