llm-coreml 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_coreml-0.1.0/PKG-INFO +190 -0
- llm_coreml-0.1.0/README.md +177 -0
- llm_coreml-0.1.0/pyproject.toml +60 -0
- llm_coreml-0.1.0/src/llm_coreml/__init__.py +83 -0
- llm_coreml-0.1.0/src/llm_coreml/inference.py +251 -0
- llm_coreml-0.1.0/src/llm_coreml/model.py +117 -0
- llm_coreml-0.1.0/src/llm_coreml/py.typed +0 -0
- llm_coreml-0.1.0/src/llm_coreml/registry.py +57 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: llm-coreml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A plugin for https://github.com/simonw/llm for running local CoreML .mlpackage model files.
|
|
5
|
+
Author: Anentropic
|
|
6
|
+
Author-email: Anentropic <ego@anentropic.com>
|
|
7
|
+
Requires-Dist: llm
|
|
8
|
+
Requires-Dist: coremltools>=8.0
|
|
9
|
+
Requires-Dist: transformers
|
|
10
|
+
Requires-Dist: numpy
|
|
11
|
+
Requires-Python: >=3.11
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# llm-coreml
|
|
15
|
+
|
|
16
|
+
A plugin for the <https://llm.datasette.io/> CLI tool that runs CoreML `.mlpackage` LLM models locally on macOS.
|
|
17
|
+
|
|
18
|
+
Point it at a model (and its corresponding HuggingFace tokenizer), then prompt it like any other `llm` model.
|
|
19
|
+
|
|
20
|
+
## Requirements
|
|
21
|
+
|
|
22
|
+
- macOS (CoreML is Apple-only)
|
|
23
|
+
- Python 3.11+
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
llm install llm-coreml
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Or for development:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
git clone https://github.com/anentropic/llm-coreml.git
|
|
35
|
+
cd llm-coreml
|
|
36
|
+
llm install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Quick start
|
|
40
|
+
|
|
41
|
+
Register a model with a name and a path to the `.mlpackage`.
|
|
42
|
+
|
|
43
|
+
The `--tokenizer` argument is the HuggingFace model name to load the tokenizer from. This should match the HF model your `.mlpackage` was derived from:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
llm coreml add my-llama /path/to/llama.mlpackage \
|
|
47
|
+
--tokenizer meta-llama/Llama-3.2-1B-Instruct
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Prompt it:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
llm -m coreml/my-llama "Explain quantum computing in one sentence"
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Check it shows up in `llm models`:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
llm models | grep coreml
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Usage
|
|
63
|
+
|
|
64
|
+
### Prompting
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Basic prompt
|
|
68
|
+
llm -m coreml/my-llama "What is Rust?"
|
|
69
|
+
|
|
70
|
+
# With a system prompt
|
|
71
|
+
llm -m coreml/my-llama "Hello" -s "You are a pirate"
|
|
72
|
+
|
|
73
|
+
# Continue a conversation
|
|
74
|
+
llm -m coreml/my-llama "What is Rust?"
|
|
75
|
+
llm -c "Compare it to Go"
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Model options
|
|
79
|
+
|
|
80
|
+
Pass options with `-o`:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
llm -m coreml/my-llama "Write a haiku" \
|
|
84
|
+
-o temperature 0.7 \
|
|
85
|
+
-o top_p 0.9 \
|
|
86
|
+
-o max_tokens 50
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Python API
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
import llm
|
|
93
|
+
|
|
94
|
+
model = llm.get_model("coreml/my-llama")
|
|
95
|
+
response = model.prompt("What is the capital of France?")
|
|
96
|
+
print(response.text())
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## CLI reference
|
|
100
|
+
|
|
101
|
+
### `llm coreml add`
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
llm coreml add <name> <path> --tokenizer <hf_id> [--compute-units <units>]
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Register a CoreML model.
|
|
108
|
+
|
|
109
|
+
| Argument | Description |
|
|
110
|
+
|---|---|
|
|
111
|
+
| `name` | Model name, used as `coreml/<name>` |
|
|
112
|
+
| `path` | Path to the `.mlpackage` directory (resolved to absolute) |
|
|
113
|
+
| `--tokenizer` | HuggingFace tokenizer model ID (required) |
|
|
114
|
+
| `--compute-units` | Compute units: `all`, `cpu_only`, `cpu_and_gpu`, `cpu_and_ne` (default: `all`) |
|
|
115
|
+
|
|
116
|
+
### `llm coreml list`
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
llm coreml list
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Lists registered models with their paths, tokenizer IDs, and compute units.
|
|
123
|
+
|
|
124
|
+
### `llm coreml remove`
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
llm coreml remove <name>
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Removes a registered model. Exits with code 1 if the model doesn't exist.
|
|
131
|
+
|
|
132
|
+
## Model options reference
|
|
133
|
+
|
|
134
|
+
| Option | Type | Default | Description |
|
|
135
|
+
|---|---|---|---|
|
|
136
|
+
| `max_tokens` | int | 200 | Maximum tokens to generate |
|
|
137
|
+
| `temperature` | float | 0.0 | Sampling temperature. 0 = greedy (deterministic) |
|
|
138
|
+
| `top_p` | float | 1.0 | Top-p nucleus sampling threshold |
|
|
139
|
+
|
|
140
|
+
## How it works
|
|
141
|
+
|
|
142
|
+
### Format auto-detection
|
|
143
|
+
|
|
144
|
+
The plugin reads the CoreML model spec at load time and checks the input names:
|
|
145
|
+
|
|
146
|
+
- `inputIds` (camelCase) = Apple format, uses float16 causal masks
|
|
147
|
+
- `input_ids` (snake_case) = HuggingFace format, uses int32 attention masks
|
|
148
|
+
|
|
149
|
+
No config file needed.
|
|
150
|
+
|
|
151
|
+
### Stateful KV-cache
|
|
152
|
+
|
|
153
|
+
If the model spec declares `stateDescriptions`, the plugin uses stateful inference with KV-cache. Otherwise it falls back to stateless inference, which reprocesses the full sequence each step (slower, but works with older models).
|
|
154
|
+
|
|
155
|
+
### Tokenization
|
|
156
|
+
|
|
157
|
+
The plugin uses `transformers.AutoTokenizer` with `apply_chat_template()` to handle chat formatting. The tokenizer is downloaded and cached the first time you use a model.
|
|
158
|
+
|
|
159
|
+
## Getting CoreML models
|
|
160
|
+
|
|
161
|
+
You can get `.mlpackage` LLM models by:
|
|
162
|
+
|
|
163
|
+
- Converting HuggingFace models with [coremltools](https://apple.github.io/coremltools/docs-guides/)
|
|
164
|
+
- Using Apple's [ml-explore](https://github.com/ml-explore) tools
|
|
165
|
+
- Downloading pre-converted models from HuggingFace (search for "coreml" tagged models)
|
|
166
|
+
|
|
167
|
+
## Development
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
uv sync --dev
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Quality gates
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
uv run basedpyright # Type checking (strict)
|
|
177
|
+
uv run ruff check # Linting
|
|
178
|
+
uv run ruff format # Formatting
|
|
179
|
+
uv run pytest # Tests
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Or all at once:
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
prek run --all-files
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## License
|
|
189
|
+
|
|
190
|
+
MIT
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# llm-coreml
|
|
2
|
+
|
|
3
|
+
A plugin for the <https://llm.datasette.io/> CLI tool that runs CoreML `.mlpackage` LLM models locally on macOS.
|
|
4
|
+
|
|
5
|
+
Point it at a model (and its corresponding HuggingFace tokenizer), then prompt it like any other `llm` model.
|
|
6
|
+
|
|
7
|
+
## Requirements
|
|
8
|
+
|
|
9
|
+
- macOS (CoreML is Apple-only)
|
|
10
|
+
- Python 3.11+
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
llm install llm-coreml
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Or for development:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
git clone https://github.com/anentropic/llm-coreml.git
|
|
22
|
+
cd llm-coreml
|
|
23
|
+
llm install -e .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Quick start
|
|
27
|
+
|
|
28
|
+
Register a model with a name and a path to the `.mlpackage`.
|
|
29
|
+
|
|
30
|
+
The `--tokenizer` argument is the HuggingFace model name to load the tokenizer from. This should match the HF model your `.mlpackage` was derived from:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
llm coreml add my-llama /path/to/llama.mlpackage \
|
|
34
|
+
--tokenizer meta-llama/Llama-3.2-1B-Instruct
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Prompt it:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
llm -m coreml/my-llama "Explain quantum computing in one sentence"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Check it shows up in `llm models`:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
llm models | grep coreml
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Usage
|
|
50
|
+
|
|
51
|
+
### Prompting
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# Basic prompt
|
|
55
|
+
llm -m coreml/my-llama "What is Rust?"
|
|
56
|
+
|
|
57
|
+
# With a system prompt
|
|
58
|
+
llm -m coreml/my-llama "Hello" -s "You are a pirate"
|
|
59
|
+
|
|
60
|
+
# Continue a conversation
|
|
61
|
+
llm -m coreml/my-llama "What is Rust?"
|
|
62
|
+
llm -c "Compare it to Go"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Model options
|
|
66
|
+
|
|
67
|
+
Pass options with `-o`:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
llm -m coreml/my-llama "Write a haiku" \
|
|
71
|
+
-o temperature 0.7 \
|
|
72
|
+
-o top_p 0.9 \
|
|
73
|
+
-o max_tokens 50
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Python API
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import llm
|
|
80
|
+
|
|
81
|
+
model = llm.get_model("coreml/my-llama")
|
|
82
|
+
response = model.prompt("What is the capital of France?")
|
|
83
|
+
print(response.text())
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## CLI reference
|
|
87
|
+
|
|
88
|
+
### `llm coreml add`
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
llm coreml add <name> <path> --tokenizer <hf_id> [--compute-units <units>]
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Register a CoreML model.
|
|
95
|
+
|
|
96
|
+
| Argument | Description |
|
|
97
|
+
|---|---|
|
|
98
|
+
| `name` | Model name, used as `coreml/<name>` |
|
|
99
|
+
| `path` | Path to the `.mlpackage` directory (resolved to absolute) |
|
|
100
|
+
| `--tokenizer` | HuggingFace tokenizer model ID (required) |
|
|
101
|
+
| `--compute-units` | Compute units: `all`, `cpu_only`, `cpu_and_gpu`, `cpu_and_ne` (default: `all`) |
|
|
102
|
+
|
|
103
|
+
### `llm coreml list`
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
llm coreml list
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Lists registered models with their paths, tokenizer IDs, and compute units.
|
|
110
|
+
|
|
111
|
+
### `llm coreml remove`
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
llm coreml remove <name>
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Removes a registered model. Exits with code 1 if the model doesn't exist.
|
|
118
|
+
|
|
119
|
+
## Model options reference
|
|
120
|
+
|
|
121
|
+
| Option | Type | Default | Description |
|
|
122
|
+
|---|---|---|---|
|
|
123
|
+
| `max_tokens` | int | 200 | Maximum tokens to generate |
|
|
124
|
+
| `temperature` | float | 0.0 | Sampling temperature. 0 = greedy (deterministic) |
|
|
125
|
+
| `top_p` | float | 1.0 | Top-p nucleus sampling threshold |
|
|
126
|
+
|
|
127
|
+
## How it works
|
|
128
|
+
|
|
129
|
+
### Format auto-detection
|
|
130
|
+
|
|
131
|
+
The plugin reads the CoreML model spec at load time and checks the input names:
|
|
132
|
+
|
|
133
|
+
- `inputIds` (camelCase) = Apple format, uses float16 causal masks
|
|
134
|
+
- `input_ids` (snake_case) = HuggingFace format, uses int32 attention masks
|
|
135
|
+
|
|
136
|
+
No config file needed.
|
|
137
|
+
|
|
138
|
+
### Stateful KV-cache
|
|
139
|
+
|
|
140
|
+
If the model spec declares `stateDescriptions`, the plugin uses stateful inference with KV-cache. Otherwise it falls back to stateless inference, which reprocesses the full sequence each step (slower, but works with older models).
|
|
141
|
+
|
|
142
|
+
### Tokenization
|
|
143
|
+
|
|
144
|
+
The plugin uses `transformers.AutoTokenizer` with `apply_chat_template()` to handle chat formatting. The tokenizer is downloaded and cached the first time you use a model.
|
|
145
|
+
|
|
146
|
+
## Getting CoreML models
|
|
147
|
+
|
|
148
|
+
You can get `.mlpackage` LLM models by:
|
|
149
|
+
|
|
150
|
+
- Converting HuggingFace models with [coremltools](https://apple.github.io/coremltools/docs-guides/)
|
|
151
|
+
- Using Apple's [ml-explore](https://github.com/ml-explore) tools
|
|
152
|
+
- Downloading pre-converted models from HuggingFace (search for "coreml" tagged models)
|
|
153
|
+
|
|
154
|
+
## Development
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
uv sync --dev
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Quality gates
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
uv run basedpyright # Type checking (strict)
|
|
164
|
+
uv run ruff check # Linting
|
|
165
|
+
uv run ruff format # Formatting
|
|
166
|
+
uv run pytest # Tests
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Or all at once:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
prek run --all-files
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## License
|
|
176
|
+
|
|
177
|
+
MIT
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "llm-coreml"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A plugin for https://github.com/simonw/llm for running local CoreML .mlpackage model files."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Anentropic", email = "ego@anentropic.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"llm",
|
|
12
|
+
"coremltools>=8.0",
|
|
13
|
+
"transformers",
|
|
14
|
+
"numpy",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.entry-points.llm]
|
|
18
|
+
llm_coreml = "llm_coreml"
|
|
19
|
+
|
|
20
|
+
[build-system]
|
|
21
|
+
requires = ["uv_build>=0.9.18,<1.0.0"]
|
|
22
|
+
build-backend = "uv_build"
|
|
23
|
+
|
|
24
|
+
[dependency-groups]
|
|
25
|
+
dev = [
|
|
26
|
+
"basedpyright>=1.38.0",
|
|
27
|
+
"ipython>=9.10.0",
|
|
28
|
+
"pdbpp>=0.12.0.post1",
|
|
29
|
+
"pytest>=8.0.0",
|
|
30
|
+
"pytest-cov>=6.0.0",
|
|
31
|
+
"ruff>=0.15.1",
|
|
32
|
+
]
|
|
33
|
+
[tool.basedpyright]
|
|
34
|
+
pythonVersion = "3.14"
|
|
35
|
+
typeCheckingMode = "strict"
|
|
36
|
+
include = ["src", "tests"]
|
|
37
|
+
reportPrivateUsage = false
|
|
38
|
+
reportMissingTypeStubs = false
|
|
39
|
+
|
|
40
|
+
[tool.ruff]
|
|
41
|
+
target-version = "py311"
|
|
42
|
+
line-length = 100
|
|
43
|
+
src = ["src"]
|
|
44
|
+
|
|
45
|
+
[tool.ruff.lint]
|
|
46
|
+
select = ["E", "F", "W", "I", "UP", "B", "SIM", "TCH", "D"]
|
|
47
|
+
ignore = [
|
|
48
|
+
"D1", # don't require docstrings everywhere
|
|
49
|
+
"D202", # allow blank line after docstring (common in tests)
|
|
50
|
+
"D203", # conflicts with D211 (no blank line before class docstring)
|
|
51
|
+
"D212", # conflicts with D213 (multiline summary on second line)
|
|
52
|
+
"D401", # imperative mood is awkward for dunder methods
|
|
53
|
+
"D413", # blank line after last section not required
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
[tool.pytest.ini_options]
|
|
57
|
+
markers = [
|
|
58
|
+
"integration: requires a real CoreML model (run 'just convert-test-model' first)",
|
|
59
|
+
]
|
|
60
|
+
addopts = "-v"
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""A plugin for llm that runs local CoreML .mlpackage model files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
import llm
|
|
10
|
+
|
|
11
|
+
from llm_coreml.registry import add_model, list_models, remove_model
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@llm.hookimpl
def register_models(register: Callable[..., object]) -> None:
    """Register all CoreML models from the local registry."""
    # Imported here (not at module level) so the plugin hook stays cheap
    # to import even when no CoreML model is ever used.
    from llm_coreml.model import CoreMLModel

    registry = list_models()
    for model_name in registry:
        entry = registry[model_name]
        model = CoreMLModel(
            model_id=f"coreml/{model_name}",
            model_path=entry["path"],
            tokenizer_id=entry["tokenizer"],
            # Entries written before compute_units existed default to "all".
            compute_units=entry.get("compute_units", "all"),
        )
        register(model)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@llm.hookimpl
def register_commands(cli: click.Group) -> None:
    """Register the `llm coreml` command group."""

    # NOTE: the nested commands' docstrings double as click's --help text,
    # so they are part of the CLI's user-facing behavior.
    @cli.group(name="coreml")
    def coreml_group() -> None:
        """Manage locally-registered CoreML models."""

    # llm coreml add NAME PATH --tokenizer ID [--compute-units UNITS]
    @coreml_group.command(name="add")
    @click.argument("name")
    @click.argument("path", type=click.Path(exists=True))  # click rejects missing paths
    @click.option("--tokenizer", required=True, help="HuggingFace tokenizer model ID")
    @click.option(
        "--compute-units",
        type=click.Choice(["all", "cpu_only", "cpu_and_gpu", "cpu_and_ne"]),
        default="all",
        help="CoreML compute units to use (default: all)",
    )
    def add_cmd(name: str, path: str, tokenizer: str, compute_units: str) -> None:  # pyright: ignore[reportUnusedFunction]
        """
        Register a CoreML model.

        NAME is the model name (used as coreml/NAME).
        PATH is the path to the .mlpackage directory.
        """
        # Store an absolute path so the registry entry stays valid no matter
        # which working directory the model was added from.
        resolved = str(Path(path).resolve())
        add_model(name, resolved, tokenizer, compute_units)
        click.echo(f"Added model coreml/{name}")

    # llm coreml list
    @coreml_group.command(name="list")
    def list_cmd() -> None:  # pyright: ignore[reportUnusedFunction]
        """List registered CoreML models."""
        models = list_models()
        if not models:
            click.echo("No CoreML models registered.")
            return
        for name, config in models.items():
            click.echo(f"coreml/{name}")
            click.echo(f" Path: {config['path']}")
            click.echo(f" Tokenizer: {config['tokenizer']}")
            # Older registry entries may predate compute_units, hence .get().
            click.echo(f" Compute units: {config.get('compute_units', 'all')}")

    # llm coreml remove NAME
    @coreml_group.command(name="remove")
    @click.argument("name")
    def remove_cmd(name: str) -> None:  # pyright: ignore[reportUnusedFunction]
        """Remove a registered CoreML model."""
        if remove_model(name):
            click.echo(f"Removed model coreml/{name}")
        else:
            # Non-zero exit code so scripts can detect the failure.
            click.echo(f"Model '{name}' not found.", err=True)
            raise SystemExit(1)
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""CoreML inference engine with format auto-detection and autoregressive generation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import enum
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from collections.abc import Iterator
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ModelFormat(enum.Enum):
    """Detected input format of a CoreML model."""

    # Apple-converted models use camelCase input names ("inputIds") and are
    # fed a float16 causal mask (see _make_input / _build_causal_mask).
    APPLE = "apple"
    # HuggingFace-exported models use snake_case input names ("input_ids")
    # and are fed an int32 attention mask (see _build_attention_mask).
    HUGGINGFACE = "huggingface"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def detect_format(spec: Any) -> ModelFormat:
    """Detect whether a model uses Apple (camelCase) or HuggingFace (snake_case) inputs."""
    # Collect the declared input feature names from the model spec.
    names: set[str] = set()
    for inp in spec.description.input:
        names.add(inp.name)

    if "inputIds" in names:
        return ModelFormat.APPLE
    if "input_ids" in names:
        return ModelFormat.HUGGINGFACE

    msg = f"Cannot detect model format. Input names: {names}"
    raise ValueError(msg)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def is_stateful(spec: Any) -> bool:
    """Check if the model supports stateful KV-cache inference."""
    # Stateful models declare their KV-cache buffers via stateDescriptions;
    # older spec versions may lack the attribute entirely.
    descriptions = getattr(spec.description, "stateDescriptions", None)
    if descriptions is None:
        return False
    return len(descriptions) > 0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class CoreMLInferenceEngine:
    """Loads a CoreML model and runs autoregressive text generation.

    The model is loaded lazily on first use.  At load time the input format
    (Apple camelCase vs HuggingFace snake_case) and statefulness (KV-cache
    support) are detected from the model spec.
    """

    # Maps our lowercase option strings to coremltools ComputeUnit names.
    COMPUTE_UNIT_MAP: dict[str, str] = {
        "all": "ALL",
        "cpu_only": "CPU_ONLY",
        "cpu_and_gpu": "CPU_AND_GPU",
        "cpu_and_ne": "CPU_AND_NE",
    }

    def __init__(self, model_path: str, compute_units: str = "all") -> None:
        """
        Args:
            model_path: path to the .mlpackage directory.
            compute_units: one of COMPUTE_UNIT_MAP's keys (default "all").
        """
        self.model_path = model_path
        self.compute_units = compute_units
        # All three are populated lazily by _load().
        self._model: Any = None
        self._format: ModelFormat | None = None
        self._stateful: bool | None = None

    def _load(self) -> Any:
        """Load and cache the CoreML model, detecting its format and statefulness."""
        if self._model is not None:
            return self._model

        import coremltools as ct  # pyright: ignore[reportMissingTypeStubs]

        # Unrecognized compute_units values fall back to ALL rather than raising.
        cu_name = self.COMPUTE_UNIT_MAP.get(self.compute_units, "ALL")
        cu = getattr(ct.ComputeUnit, cu_name)
        self._model = ct.models.MLModel(
            self.model_path,
            compute_units=cu,
        )
        spec = self._model.get_spec()
        self._format = detect_format(spec)
        self._stateful = is_stateful(spec)
        return self._model

    @property
    def format(self) -> ModelFormat:
        """Model input format (loads model if needed)."""
        self._load()
        assert self._format is not None
        return self._format

    @property
    def stateful(self) -> bool:
        """Whether the model supports stateful KV-cache."""
        self._load()
        assert self._stateful is not None
        return self._stateful

    def generate(
        self,
        input_ids: list[int],
        *,
        max_tokens: int = 200,
        temperature: float = 0.0,
        top_p: float = 1.0,
        eos_token_id: int | None = None,
    ) -> Iterator[int]:
        """Generate tokens autoregressively, yielding one token ID at a time.

        Args:
            input_ids: prompt token IDs.  Never mutated by this call.
            max_tokens: maximum number of tokens to yield.
            temperature: sampling temperature (<= 0 means greedy).
            top_p: nucleus sampling threshold.
            eos_token_id: generation stops (without yielding) at this token.
        """
        model = self._load()

        # An empty prompt gives the model nothing to condition on; yield nothing
        # instead of crashing on an empty prediction output.
        if not input_ids:
            return

        step = self._generate_stateful if self._stateful else self._generate_stateless
        yield from step(
            model,
            input_ids,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            eos_token_id=eos_token_id,
        )

    def _generate_stateful(
        self,
        model: Any,
        input_ids: list[int],
        *,
        max_tokens: int,
        temperature: float,
        top_p: float,
        eos_token_id: int | None,
    ) -> Iterator[int]:
        """Generate using the model's KV-cache state: one token per predict call."""
        state = model.make_state()
        prompt_len = len(input_ids)

        # Prefill: feed prompt tokens one at a time to populate the KV-cache.
        output: dict[str, np.ndarray] = {}
        for i, token_id in enumerate(input_ids):
            feed = self._make_input(
                [token_id],
                seq_len=prompt_len,
                position=i,
            )
            output = model.predict(feed, state=state)

        # Decode: sample a token, feed it back, repeat.
        # BUGFIX: track the KV length in a counter instead of appending to
        # `input_ids` — the old code mutated the caller's list, which inflated
        # the prompt-token count the caller reads afterwards.
        kv_len = prompt_len
        next_token = self._extract_next_token(output, temperature, top_p)
        for _ in range(max_tokens):
            if eos_token_id is not None and next_token == eos_token_id:
                break
            yield next_token

            feed = self._make_input(
                [next_token],
                seq_len=kv_len + 1,
                position=kv_len,
            )
            kv_len += 1
            output = model.predict(feed, state=state)
            next_token = self._extract_next_token(output, temperature, top_p)

    def _generate_stateless(
        self,
        model: Any,
        input_ids: list[int],
        *,
        max_tokens: int,
        temperature: float,
        top_p: float,
        eos_token_id: int | None,
    ) -> Iterator[int]:
        """Generate without KV-cache: re-run the full sequence every step (slow)."""
        seq = list(input_ids)  # work on a copy; never mutate the caller's list
        for _ in range(max_tokens):
            feed = self._make_input(seq, seq_len=len(seq), position=0)
            output: dict[str, np.ndarray] = model.predict(feed)
            next_token = self._extract_next_token(output, temperature, top_p)
            if eos_token_id is not None and next_token == eos_token_id:
                break
            yield next_token
            seq.append(next_token)

    def _make_input(
        self,
        token_ids: list[int],
        *,
        seq_len: int,
        position: int,
    ) -> dict[str, np.ndarray]:
        """Build the feed dict for one predict call, matching the detected format."""
        assert self._format is not None
        ids = np.array([token_ids], dtype=np.int32)

        if self._format == ModelFormat.APPLE:
            return {
                "inputIds": ids,
                "causalMask": _build_causal_mask(len(token_ids), seq_len),
            }
        return {
            "input_ids": ids,
            "attention_mask": _build_attention_mask(seq_len),
        }

    def _extract_next_token(
        self,
        output: dict[str, np.ndarray],
        temperature: float,
        top_p: float,
    ) -> int:
        """Pull the last position's logits out of a predict output and sample."""
        # Prefer an output named "logits"; otherwise fall back to the first output.
        logits_key = "logits" if "logits" in output else next(iter(output))
        logits = np.array(output[logits_key], dtype=np.float32)
        # Reduce (batch, seq, vocab) or (seq, vocab) to the last token's vector.
        if logits.ndim == 3:
            logits = logits[0, -1, :]
        elif logits.ndim == 2:
            logits = logits[-1, :]
        return sample_token(logits, temperature=temperature, top_p=top_p)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _build_causal_mask(query_len: int, kv_len: int) -> np.ndarray:
|
|
212
|
+
"""Build a float16 causal mask for Apple-format models."""
|
|
213
|
+
mask = np.full((1, 1, query_len, kv_len), -np.inf, dtype=np.float16)
|
|
214
|
+
for i in range(query_len):
|
|
215
|
+
pos = kv_len - query_len + i
|
|
216
|
+
mask[0, 0, i, : pos + 1] = 0.0
|
|
217
|
+
return mask
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _build_attention_mask(seq_len: int) -> np.ndarray:
|
|
221
|
+
"""Build an int32 attention mask for HuggingFace-format models."""
|
|
222
|
+
return np.ones((1, seq_len), dtype=np.int32)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def sample_token(
    logits: np.ndarray,
    *,
    temperature: float = 0.0,
    top_p: float = 1.0,
) -> int:
    """Sample a token from logits using temperature and top-p nucleus sampling.

    Args:
        logits: 1-D array of unnormalized scores, one per vocabulary token.
        temperature: softmax temperature; <= 0 means greedy argmax.
        top_p: nucleus threshold; < 1.0 restricts sampling to the smallest
            set of tokens whose cumulative probability reaches top_p.

    Returns:
        The selected token ID.
    """
    # Greedy decoding: deterministic, no randomness involved.
    if temperature <= 0:
        return int(np.argmax(logits))

    # Temperature-scaled, numerically stable softmax.  Compute exp() once
    # (the previous version evaluated it twice).
    scaled = logits / temperature
    exp = np.exp(scaled - np.max(scaled))
    probs = exp / exp.sum()

    if top_p < 1.0:
        # Keep the smallest prefix of tokens (by descending probability) whose
        # cumulative mass reaches top_p; zero out the rest and renormalize.
        sorted_indices = np.argsort(-probs)
        cumulative = np.cumsum(probs[sorted_indices])
        cutoff = int(np.searchsorted(cumulative, top_p)) + 1
        mask = np.zeros_like(probs)
        mask[sorted_indices[:cutoff]] = 1.0
        probs = probs * mask
        probs = probs / probs.sum()

    # NOTE: uses numpy's global RNG; seed np.random for reproducible sampling.
    return int(np.random.choice(len(probs), p=probs))
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""CoreML model implementation for llm."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
7
|
+
import llm
|
|
8
|
+
from pydantic import Field
|
|
9
|
+
|
|
10
|
+
from llm_coreml.inference import CoreMLInferenceEngine
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Iterator
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CoreMLOptions(llm.Options):
    """Options for CoreML model inference."""

    # A value of None means "use the default": CoreMLModel.execute falls back
    # with e.g. `opts.max_tokens or 200`.
    max_tokens: int | None = Field(
        description="Maximum number of tokens to generate",
        default=200,
    )
    # 0.0 selects greedy (deterministic) decoding in sample_token.
    temperature: float | None = Field(
        description="Sampling temperature (0 for greedy)",
        default=0.0,
    )
    # 1.0 disables nucleus filtering entirely.
    top_p: float | None = Field(
        description="Top-p nucleus sampling threshold",
        default=1.0,
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class CoreMLModel(llm.Model):
    """An llm Model backed by a local CoreML .mlpackage."""

    can_stream = True
    model_id: str
    model_path: str
    tokenizer_id: str
    Options = CoreMLOptions  # type: ignore[assignment]

    def __init__(
        self,
        model_id: str,
        model_path: str,
        tokenizer_id: str,
        compute_units: str = "all",
    ) -> None:
        """
        Args:
            model_id: llm-facing model ID (e.g. "coreml/my-llama").
            model_path: filesystem path to the .mlpackage directory.
            tokenizer_id: HuggingFace model ID to load the tokenizer from.
            compute_units: CoreML compute units ("all", "cpu_only", ...).
        """
        self.model_id = model_id
        self.model_path = model_path
        self.tokenizer_id = tokenizer_id
        self.compute_units = compute_units
        # Engine and tokenizer are created lazily on first prompt.
        self._engine: CoreMLInferenceEngine | None = None
        self._tokenizer: Any = None

    def _get_engine(self) -> CoreMLInferenceEngine:
        """Create (once) and return the inference engine."""
        if self._engine is None:
            self._engine = CoreMLInferenceEngine(self.model_path, self.compute_units)
        return self._engine

    def _get_tokenizer(self) -> Any:
        """Create (once) and return the HuggingFace tokenizer; may download on first use."""
        if self._tokenizer is None:
            from transformers import AutoTokenizer  # pyright: ignore[reportMissingTypeStubs]

            self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)  # pyright: ignore[reportUnknownMemberType]
        return self._tokenizer  # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType]

    def execute(
        self,
        prompt: llm.Prompt,
        stream: bool,
        response: llm.Response,
        conversation: llm.Conversation | None,
    ) -> Iterator[str]:
        """Tokenize the conversation, run generation, and yield decoded text chunks."""
        tokenizer = self._get_tokenizer()
        engine = self._get_engine()

        messages = _build_messages(prompt, conversation)
        input_ids: list[int] = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
        )
        # BUGFIX: snapshot the prompt length before generating.  The stateful
        # engine appends generated tokens to the list it is given, so reading
        # len(input_ids) afterwards over-counted the input tokens.
        prompt_token_count = len(input_ids)

        opts: CoreMLOptions = prompt.options  # type: ignore[assignment]
        token_count = 0
        for token_id in engine.generate(
            input_ids,
            max_tokens=opts.max_tokens or 200,
            temperature=opts.temperature or 0.0,
            top_p=opts.top_p or 1.0,
            eos_token_id=tokenizer.eos_token_id,
        ):
            token_count += 1
            yield tokenizer.decode([token_id])  # type: ignore[no-any-return]

        response.set_usage(input=prompt_token_count, output=token_count)  # pyright: ignore[reportUnknownMemberType]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _build_messages(
|
|
100
|
+
prompt: llm.Prompt,
|
|
101
|
+
conversation: llm.Conversation | None,
|
|
102
|
+
) -> list[dict[str, str]]:
|
|
103
|
+
"""Reconstruct a chat message list from the conversation history."""
|
|
104
|
+
messages: list[dict[str, str]] = []
|
|
105
|
+
|
|
106
|
+
if prompt.system:
|
|
107
|
+
messages.append({"role": "system", "content": prompt.system})
|
|
108
|
+
|
|
109
|
+
if conversation is not None:
|
|
110
|
+
for prev in conversation.responses:
|
|
111
|
+
messages.append({"role": "user", "content": prev.prompt.prompt or ""})
|
|
112
|
+
messages.append(
|
|
113
|
+
{"role": "assistant", "content": prev.text() or ""}, # type: ignore[union-attr]
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
messages.append({"role": "user", "content": prompt.prompt or ""})
|
|
117
|
+
return messages
|
|
File without changes
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""JSON registry for CoreML model configurations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import llm
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_registry_path() -> Path:
    """Return the path to the registry JSON file."""
    # The registry lives alongside llm's other per-user data.
    user_dir = Path(llm.user_dir())
    return user_dir / "llm-coreml.json"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _read_registry() -> dict[str, dict[str, str]]:
    """Load the registry file; an absent file means no models are registered."""
    registry_file = get_registry_path()
    if registry_file.exists():
        return json.loads(registry_file.read_text())  # type: ignore[no-any-return]
    return {}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _write_registry(data: dict[str, Any]) -> None:
    """Serialize the registry to its JSON file (pretty-printed, trailing newline)."""
    serialized = json.dumps(data, indent=2)
    get_registry_path().write_text(serialized + "\n")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def add_model(name: str, path: str, tokenizer: str, compute_units: str = "all") -> None:
    """Register a model with the given name, path, tokenizer ID, and compute units."""
    registry = _read_registry()
    # Re-adding an existing name overwrites its entry.
    entry = {
        "path": path,
        "tokenizer": tokenizer,
        "compute_units": compute_units,
    }
    registry[name] = entry
    _write_registry(registry)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def remove_model(name: str) -> bool:
    """Remove a model by name. Returns True if it existed."""
    registry = _read_registry()
    existed = name in registry
    if existed:
        registry.pop(name)
        # Only rewrite the file when something actually changed.
        _write_registry(registry)
    return existed
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def list_models() -> dict[str, dict[str, str]]:
    """Return all registered models."""
    registry = _read_registry()
    return registry
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_model(name: str) -> dict[str, str] | None:
    """Return config for a single model, or None if not found."""
    registry = _read_registry()
    return registry.get(name)
|