llm-coreml 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,190 @@
+ Metadata-Version: 2.3
+ Name: llm-coreml
+ Version: 0.1.0
+ Summary: A plugin for https://github.com/simonw/llm for running local CoreML .mlpackage model files.
+ Author: Anentropic
+ Author-email: Anentropic <ego@anentropic.com>
+ Requires-Dist: llm
+ Requires-Dist: coremltools>=8.0
+ Requires-Dist: transformers
+ Requires-Dist: numpy
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+
+ # llm-coreml
+
+ A plugin for the <https://llm.datasette.io/> CLI tool that runs CoreML `.mlpackage` LLM models locally on macOS.
+
+ Point it at a model (and its corresponding HuggingFace tokenizer), then prompt it like any other `llm` model.
+
+ ## Requirements
+
+ - macOS (CoreML is Apple-only)
+ - Python 3.11+
+
+ ## Installation
+
+ ```bash
+ llm install llm-coreml
+ ```
+
+ Or for development:
+
+ ```bash
+ git clone https://github.com/anentropic/llm-coreml.git
+ cd llm-coreml
+ llm install -e .
+ ```
+
+ ## Quick start
+
+ Register a model with a name and a path to the `.mlpackage`.
+
+ The `--tokenizer` argument is the HuggingFace model name to load the tokenizer from. It should match the HF model your `.mlpackage` was derived from:
+
+ ```bash
+ llm coreml add my-llama /path/to/llama.mlpackage \
+     --tokenizer meta-llama/Llama-3.2-1B-Instruct
+ ```
+
+ Prompt it:
+
+ ```bash
+ llm -m coreml/my-llama "Explain quantum computing in one sentence"
+ ```
+
+ Check it shows up in `llm models`:
+
+ ```bash
+ llm models | grep coreml
+ ```
+
+ ## Usage
+
+ ### Prompting
+
+ ```bash
+ # Basic prompt
+ llm -m coreml/my-llama "What is Rust?"
+
+ # With a system prompt
+ llm -m coreml/my-llama "Hello" -s "You are a pirate"
+
+ # Continue a conversation
+ llm -m coreml/my-llama "What is Rust?"
+ llm -c "Compare it to Go"
+ ```
+
+ ### Model options
+
+ Pass options with `-o`:
+
+ ```bash
+ llm -m coreml/my-llama "Write a haiku" \
+     -o temperature 0.7 \
+     -o top_p 0.9 \
+     -o max_tokens 50
+ ```
+
+ ### Python API
+
+ ```python
+ import llm
+
+ model = llm.get_model("coreml/my-llama")
+ response = model.prompt("What is the capital of France?")
+ print(response.text())
+ ```
+
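+ Options can also be passed as keyword arguments. A minimal sketch, assuming the standard `llm` Python API, where extra keyword arguments to `prompt()` map onto the model options listed below:
+
+ ```python
+ import llm
+
+ model = llm.get_model("coreml/my-llama")
+ # equivalent to -o temperature 0.7 -o top_p 0.9 -o max_tokens 50 on the CLI
+ response = model.prompt(
+     "Write a haiku",
+     temperature=0.7,
+     top_p=0.9,
+     max_tokens=50,
+ )
+ print(response.text())
+ ```
+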
+ ## CLI reference
+
+ ### `llm coreml add`
+
+ ```
+ llm coreml add <name> <path> --tokenizer <hf_id> [--compute-units <units>]
+ ```
+
+ Registers a CoreML model.
+
+ | Argument | Description |
+ |---|---|
+ | `name` | Model name, used as `coreml/<name>` |
+ | `path` | Path to the `.mlpackage` directory (resolved to absolute) |
+ | `--tokenizer` | HuggingFace tokenizer model ID (required) |
+ | `--compute-units` | Compute units: `all`, `cpu_only`, `cpu_and_gpu`, `cpu_and_ne` (default: `all`) |
+
+ ### `llm coreml list`
+
+ ```
+ llm coreml list
+ ```
+
+ Lists registered models with their paths, tokenizer IDs, and compute units.
+
+ ### `llm coreml remove`
+
+ ```
+ llm coreml remove <name>
+ ```
+
+ Removes a registered model. Exits with code 1 if the model doesn't exist.
+
+ ## Model options reference
+
+ | Option | Type | Default | Description |
+ |---|---|---|---|
+ | `max_tokens` | int | 200 | Maximum tokens to generate |
+ | `temperature` | float | 0.0 | Sampling temperature. 0 = greedy (deterministic) |
+ | `top_p` | float | 1.0 | Top-p nucleus sampling threshold |
+
+ ## How it works
+
+ ### Format auto-detection
+
+ The plugin reads the CoreML model spec at load time and checks the input names:
+
+ - `inputIds` (camelCase) → Apple format; uses float16 causal masks
+ - `input_ids` (snake_case) → HuggingFace format; uses int32 attention masks
+
+ No config file needed.
+
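+ For reference, this is roughly what the detection looks like with `coremltools` (a sketch of the idea, not the plugin's exact code; the path is a placeholder):
+
+ ```python
+ import coremltools as ct
+
+ spec = ct.models.MLModel("/path/to/model.mlpackage").get_spec()
+ input_names = {inp.name for inp in spec.description.input}
+ # camelCase -> Apple format, snake_case -> HuggingFace format
+ fmt = "apple" if "inputIds" in input_names else "huggingface"
+ ```
+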
+ ### Stateful KV-cache
+
+ If the model spec declares `stateDescriptions`, the plugin uses stateful inference with a KV-cache. Otherwise it falls back to stateless inference, which reprocesses the full sequence at each step (slower, but works with older models).
+
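+ A minimal sketch of what stateful decoding looks like with `coremltools>=8`, assuming a HuggingFace-format model and a `token_ids` list from the tokenizer (real input names and shapes depend on how the model was converted):
+
+ ```python
+ import coremltools as ct
+ import numpy as np
+
+ mlmodel = ct.models.MLModel("/path/to/model.mlpackage")  # placeholder path
+ token_ids = [128000, 9906]  # illustrative prompt token IDs
+
+ state = mlmodel.make_state()  # holds the KV-cache between predict() calls
+ for i, tok in enumerate(token_ids):
+     # each call processes just one new token; the state carries the rest
+     output = mlmodel.predict(
+         {
+             "input_ids": np.array([[tok]], dtype=np.int32),
+             "attention_mask": np.ones((1, i + 1), dtype=np.int32),
+         },
+         state=state,
+     )
+ ```
+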
+ ### Tokenization
+
+ The plugin uses `transformers.AutoTokenizer` with `apply_chat_template()` to handle chat formatting. The tokenizer is downloaded and cached the first time you use a model.
+
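+ In other words, roughly (using the tokenizer from the Quick start example):
+
+ ```python
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
+ messages = [
+     {"role": "system", "content": "You are a pirate"},
+     {"role": "user", "content": "Hello"},
+ ]
+ # token IDs ready to feed to the CoreML model
+ input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+ ```
+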
+ ## Getting CoreML models
+
+ You can get `.mlpackage` LLM models by:
+
+ - Converting HuggingFace models with [coremltools](https://apple.github.io/coremltools/docs-guides/) (see the sketch below)
+ - Using Apple's [ml-explore](https://github.com/ml-explore) tools
+ - Downloading pre-converted models from HuggingFace (search for "coreml" tagged models)
+
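+ Conversion details vary a lot by architecture. As a rough sketch of the first route, a traced PyTorch module can be converted like this (`wrapped`, the shapes, and the deployment target are illustrative assumptions, not a recipe for any particular model):
+
+ ```python
+ import coremltools as ct
+ import numpy as np
+ import torch
+
+ # `wrapped` is assumed to be a torch.nn.Module that takes token IDs
+ # and returns logits
+ example = torch.zeros((1, 128), dtype=torch.int32)
+ traced = torch.jit.trace(wrapped, example)
+
+ mlmodel = ct.convert(
+     traced,
+     inputs=[ct.TensorType(name="input_ids", shape=(1, 128), dtype=np.int32)],
+     minimum_deployment_target=ct.target.macOS15,
+ )
+ mlmodel.save("model.mlpackage")
+ ```
+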
+ ## Development
+
+ ```bash
+ uv sync --dev
+ ```
+
+ ### Quality gates
+
+ ```bash
+ uv run basedpyright  # Type checking (strict)
+ uv run ruff check    # Linting
+ uv run ruff format   # Formatting
+ uv run pytest        # Tests
+ ```
+
+ Or all at once:
+
+ ```bash
+ prek run --all-files
+ ```
+
+ ## License
+
+ MIT
@@ -0,0 +1,60 @@
+ [project]
+ name = "llm-coreml"
+ version = "0.1.0"
+ description = "A plugin for https://github.com/simonw/llm for running local CoreML .mlpackage model files."
+ readme = "README.md"
+ authors = [
+     { name = "Anentropic", email = "ego@anentropic.com" }
+ ]
+ requires-python = ">=3.11"
+ dependencies = [
+     "llm",
+     "coremltools>=8.0",
+     "transformers",
+     "numpy",
+ ]
+
+ [project.entry-points.llm]
+ llm_coreml = "llm_coreml"
+
+ [build-system]
+ requires = ["uv_build>=0.9.18,<1.0.0"]
+ build-backend = "uv_build"
+
+ [dependency-groups]
+ dev = [
+     "basedpyright>=1.38.0",
+     "ipython>=9.10.0",
+     "pdbpp>=0.12.0.post1",
+     "pytest>=8.0.0",
+     "pytest-cov>=6.0.0",
+     "ruff>=0.15.1",
+ ]
+
+ [tool.basedpyright]
+ pythonVersion = "3.14"
+ typeCheckingMode = "strict"
+ include = ["src", "tests"]
+ reportPrivateUsage = false
+ reportMissingTypeStubs = false
+
+ [tool.ruff]
+ target-version = "py311"
+ line-length = 100
+ src = ["src"]
+
+ [tool.ruff.lint]
+ select = ["E", "F", "W", "I", "UP", "B", "SIM", "TCH", "D"]
+ ignore = [
+     "D1",    # don't require docstrings everywhere
+     "D202",  # allow blank line after docstring (common in tests)
+     "D203",  # conflicts with D211 (no blank line before class docstring)
+     "D212",  # conflicts with D213 (multiline summary on second line)
+     "D401",  # imperative mood is awkward for dunder methods
+     "D413",  # blank line after last section not required
+ ]
+
+ [tool.pytest.ini_options]
+ markers = [
+     "integration: requires a real CoreML model (run 'just convert-test-model' first)",
+ ]
+ addopts = "-v"
@@ -0,0 +1,83 @@
+ """A plugin for llm that runs local CoreML .mlpackage model files."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import click
+ import llm
+
+ from llm_coreml.registry import add_model, list_models, remove_model
+
+ if TYPE_CHECKING:
+     from collections.abc import Callable
+
+
+ @llm.hookimpl
+ def register_models(register: Callable[..., object]) -> None:
+     """Register all CoreML models from the local registry."""
+     from llm_coreml.model import CoreMLModel
+
+     for name, config in list_models().items():
+         register(
+             CoreMLModel(
+                 model_id=f"coreml/{name}",
+                 model_path=config["path"],
+                 tokenizer_id=config["tokenizer"],
+                 compute_units=config.get("compute_units", "all"),
+             ),
+         )
+
+
+ @llm.hookimpl
+ def register_commands(cli: click.Group) -> None:
+     """Register the `llm coreml` command group."""
+
+     @cli.group(name="coreml")
+     def coreml_group() -> None:
+         """Manage locally-registered CoreML models."""
+
+     @coreml_group.command(name="add")
+     @click.argument("name")
+     @click.argument("path", type=click.Path(exists=True))
+     @click.option("--tokenizer", required=True, help="HuggingFace tokenizer model ID")
+     @click.option(
+         "--compute-units",
+         type=click.Choice(["all", "cpu_only", "cpu_and_gpu", "cpu_and_ne"]),
+         default="all",
+         help="CoreML compute units to use (default: all)",
+     )
+     def add_cmd(name: str, path: str, tokenizer: str, compute_units: str) -> None:  # pyright: ignore[reportUnusedFunction]
+         """
+         Register a CoreML model.
+
+         NAME is the model name (used as coreml/NAME).
+         PATH is the path to the .mlpackage directory.
+         """
+         resolved = str(Path(path).resolve())
+         add_model(name, resolved, tokenizer, compute_units)
+         click.echo(f"Added model coreml/{name}")
+
+     @coreml_group.command(name="list")
+     def list_cmd() -> None:  # pyright: ignore[reportUnusedFunction]
+         """List registered CoreML models."""
+         models = list_models()
+         if not models:
+             click.echo("No CoreML models registered.")
+             return
+         for name, config in models.items():
+             click.echo(f"coreml/{name}")
+             click.echo(f"  Path: {config['path']}")
+             click.echo(f"  Tokenizer: {config['tokenizer']}")
+             click.echo(f"  Compute units: {config.get('compute_units', 'all')}")
+
+     @coreml_group.command(name="remove")
+     @click.argument("name")
+     def remove_cmd(name: str) -> None:  # pyright: ignore[reportUnusedFunction]
+         """Remove a registered CoreML model."""
+         if remove_model(name):
+             click.echo(f"Removed model coreml/{name}")
+         else:
+             click.echo(f"Model '{name}' not found.", err=True)
+             raise SystemExit(1)
@@ -0,0 +1,251 @@
+ """CoreML inference engine with format auto-detection and autoregressive generation."""
+
+ from __future__ import annotations
+
+ import enum
+ from typing import TYPE_CHECKING, Any
+
+ import numpy as np
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterator
+
+
+ class ModelFormat(enum.Enum):
+     """Detected input format of a CoreML model."""
+
+     APPLE = "apple"
+     HUGGINGFACE = "huggingface"
+
+
+ def detect_format(spec: Any) -> ModelFormat:
+     """Detect whether a model uses Apple (camelCase) or HuggingFace (snake_case) inputs."""
+     input_names: set[str] = {inp.name for inp in spec.description.input}
+     if "inputIds" in input_names:
+         return ModelFormat.APPLE
+     if "input_ids" in input_names:
+         return ModelFormat.HUGGINGFACE
+     msg = f"Cannot detect model format. Input names: {input_names}"
+     raise ValueError(msg)
+
+
+ def is_stateful(spec: Any) -> bool:
+     """Check if the model supports stateful KV-cache inference."""
+     state_descs = getattr(spec.description, "stateDescriptions", None)
+     return state_descs is not None and len(state_descs) > 0
+
+
+ class CoreMLInferenceEngine:
+     """Loads a CoreML model and runs autoregressive text generation."""
+
+     COMPUTE_UNIT_MAP: dict[str, str] = {
+         "all": "ALL",
+         "cpu_only": "CPU_ONLY",
+         "cpu_and_gpu": "CPU_AND_GPU",
+         "cpu_and_ne": "CPU_AND_NE",
+     }
+
+     def __init__(self, model_path: str, compute_units: str = "all") -> None:
+         self.model_path = model_path
+         self.compute_units = compute_units
+         self._model: Any = None
+         self._format: ModelFormat | None = None
+         self._stateful: bool | None = None
+
+     def _load(self) -> Any:
+         if self._model is not None:
+             return self._model
+
+         import coremltools as ct  # pyright: ignore[reportMissingTypeStubs]
+
+         cu_name = self.COMPUTE_UNIT_MAP.get(self.compute_units, "ALL")
+         cu = getattr(ct.ComputeUnit, cu_name)
+         self._model = ct.models.MLModel(
+             self.model_path,
+             compute_units=cu,
+         )
+         spec = self._model.get_spec()
+         self._format = detect_format(spec)
+         self._stateful = is_stateful(spec)
+         return self._model
+
+     @property
+     def format(self) -> ModelFormat:
+         """Model input format (loads model if needed)."""
+         self._load()
+         assert self._format is not None
+         return self._format
+
+     @property
+     def stateful(self) -> bool:
+         """Whether the model supports stateful KV-cache."""
+         self._load()
+         assert self._stateful is not None
+         return self._stateful
+
+     def generate(
+         self,
+         input_ids: list[int],
+         *,
+         max_tokens: int = 200,
+         temperature: float = 0.0,
+         top_p: float = 1.0,
+         eos_token_id: int | None = None,
+     ) -> Iterator[int]:
+         """Generate tokens autoregressively, yielding one token ID at a time."""
+         model = self._load()
+
+         if self._stateful:
+             yield from self._generate_stateful(
+                 model,
+                 input_ids,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 eos_token_id=eos_token_id,
+             )
+         else:
+             yield from self._generate_stateless(
+                 model,
+                 input_ids,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 eos_token_id=eos_token_id,
+             )
+
+     def _generate_stateful(
+         self,
+         model: Any,
+         input_ids: list[int],
+         *,
+         max_tokens: int,
+         temperature: float,
+         top_p: float,
+         eos_token_id: int | None,
+     ) -> Iterator[int]:
+         state = model.make_state()
+
+         # Prefill: process the prompt one token at a time, accumulating
+         # the KV-cache in `state`
+         output: dict[str, np.ndarray] = {}
+         for i, token_id in enumerate(input_ids):
+             feed = self._make_input(
+                 [token_id],
+                 seq_len=len(input_ids),
+                 position=i,
+             )
+             output = model.predict(feed, state=state)
+
+         # Decode: track the absolute position locally rather than
+         # appending to input_ids, so the caller's list is not mutated
+         pos = len(input_ids)
+         next_token = self._extract_next_token(output, temperature, top_p)
+         for _ in range(max_tokens):
+             if eos_token_id is not None and next_token == eos_token_id:
+                 break
+             yield next_token
+
+             feed = self._make_input(
+                 [next_token],
+                 seq_len=pos + 1,
+                 position=pos,
+             )
+             pos += 1
+             output = model.predict(feed, state=state)
+             next_token = self._extract_next_token(output, temperature, top_p)
+
+     def _generate_stateless(
+         self,
+         model: Any,
+         input_ids: list[int],
+         *,
+         max_tokens: int,
+         temperature: float,
+         top_p: float,
+         eos_token_id: int | None,
+     ) -> Iterator[int]:
+         seq = list(input_ids)
+         for _ in range(max_tokens):
+             feed = self._make_input(seq, seq_len=len(seq), position=0)
+             output: dict[str, np.ndarray] = model.predict(feed)
+             next_token = self._extract_next_token(output, temperature, top_p)
+             if eos_token_id is not None and next_token == eos_token_id:
+                 break
+             yield next_token
+             seq.append(next_token)
+
+     def _make_input(
+         self,
+         token_ids: list[int],
+         *,
+         seq_len: int,
+         position: int,
+     ) -> dict[str, np.ndarray]:
+         assert self._format is not None
+         ids = np.array([token_ids], dtype=np.int32)
+
+         if self._format == ModelFormat.APPLE:
+             return {
+                 "inputIds": ids,
+                 # pass the absolute position so the mask only exposes
+                 # tokens that have actually been processed so far
+                 "causalMask": _build_causal_mask(len(token_ids), seq_len, position),
+             }
+         return {
+             "input_ids": ids,
+             "attention_mask": _build_attention_mask(seq_len),
+         }
+
+     def _extract_next_token(
+         self,
+         output: dict[str, np.ndarray],
+         temperature: float,
+         top_p: float,
+     ) -> int:
+         logits_key = "logits" if "logits" in output else next(iter(output))
+         logits = np.array(output[logits_key], dtype=np.float32)
+         # Take last token's logits
+         if logits.ndim == 3:
+             logits = logits[0, -1, :]
+         elif logits.ndim == 2:
+             logits = logits[-1, :]
+         return sample_token(logits, temperature=temperature, top_p=top_p)
+
+
+ def _build_causal_mask(query_len: int, kv_len: int, start_pos: int) -> np.ndarray:
+     """Build a float16 causal mask for Apple-format models.
+
+     Query token ``q`` sits at absolute position ``start_pos + q`` and may
+     attend to key positions ``0 .. start_pos + q``; later positions stay -inf.
+     """
+     mask = np.full((1, 1, query_len, kv_len), -np.inf, dtype=np.float16)
+     for q in range(query_len):
+         mask[0, 0, q, : start_pos + q + 1] = 0.0
+     return mask
+
+
+ def _build_attention_mask(seq_len: int) -> np.ndarray:
+     """Build an int32 attention mask for HuggingFace-format models."""
+     return np.ones((1, seq_len), dtype=np.int32)
+
+
+ def sample_token(
+     logits: np.ndarray,
+     *,
+     temperature: float = 0.0,
+     top_p: float = 1.0,
+ ) -> int:
+     """Sample a token from logits using temperature and top-p nucleus sampling."""
+     if temperature <= 0:
+         return int(np.argmax(logits))
+
+     # float64 keeps the probabilities precise enough to pass
+     # np.random.choice's sum-to-1 validation
+     logits = logits.astype(np.float64) / temperature
+     # Numerical stability
+     logits = logits - np.max(logits)
+     probs = np.exp(logits) / np.sum(np.exp(logits))
+
+     if top_p < 1.0:
+         sorted_indices = np.argsort(-probs)
+         sorted_probs = probs[sorted_indices]
+         cumulative = np.cumsum(sorted_probs)
+         cutoff = np.searchsorted(cumulative, top_p) + 1
+         # Zero out tokens beyond the nucleus cutoff and renormalize
+         mask = np.zeros_like(probs)
+         mask[sorted_indices[:cutoff]] = 1.0
+         probs = probs * mask
+         probs = probs / np.sum(probs)
+
+     return int(np.random.choice(len(probs), p=probs))
@@ -0,0 +1,117 @@
+ """CoreML model implementation for llm."""
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any
+
+ import llm
+ from pydantic import Field
+
+ from llm_coreml.inference import CoreMLInferenceEngine
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterator
+
+
+ class CoreMLOptions(llm.Options):
+     """Options for CoreML model inference."""
+
+     max_tokens: int | None = Field(
+         description="Maximum number of tokens to generate",
+         default=200,
+     )
+     temperature: float | None = Field(
+         description="Sampling temperature (0 for greedy)",
+         default=0.0,
+     )
+     top_p: float | None = Field(
+         description="Top-p nucleus sampling threshold",
+         default=1.0,
+     )
+
+
+ class CoreMLModel(llm.Model):
+     """An llm Model backed by a local CoreML .mlpackage."""
+
+     can_stream = True
+     model_id: str
+     model_path: str
+     tokenizer_id: str
+     Options = CoreMLOptions  # type: ignore[assignment]
+
+     def __init__(
+         self,
+         model_id: str,
+         model_path: str,
+         tokenizer_id: str,
+         compute_units: str = "all",
+     ) -> None:
+         self.model_id = model_id
+         self.model_path = model_path
+         self.tokenizer_id = tokenizer_id
+         self.compute_units = compute_units
+         self._engine: CoreMLInferenceEngine | None = None
+         self._tokenizer: Any = None
+
+     def _get_engine(self) -> CoreMLInferenceEngine:
+         if self._engine is None:
+             self._engine = CoreMLInferenceEngine(self.model_path, self.compute_units)
+         return self._engine
+
+     def _get_tokenizer(self) -> Any:
+         if self._tokenizer is None:
+             from transformers import AutoTokenizer  # pyright: ignore[reportMissingTypeStubs]
+
+             self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)  # pyright: ignore[reportUnknownMemberType]
+         return self._tokenizer  # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType]
+
+     def execute(
+         self,
+         prompt: llm.Prompt,
+         stream: bool,
+         response: llm.Response,
+         conversation: llm.Conversation | None,
+     ) -> Iterator[str]:
+         tokenizer = self._get_tokenizer()
+         engine = self._get_engine()
+
+         messages = _build_messages(prompt, conversation)
+         input_ids: list[int] = tokenizer.apply_chat_template(
+             messages,
+             add_generation_prompt=True,
+         )
+
+         opts: CoreMLOptions = prompt.options  # type: ignore[assignment]
+         token_count = 0
+         for token_id in engine.generate(
+             input_ids,
+             max_tokens=opts.max_tokens or 200,
+             temperature=opts.temperature or 0.0,
+             top_p=opts.top_p or 1.0,
+             eos_token_id=tokenizer.eos_token_id,
+         ):
+             token_count += 1
+             # NB: decoding one token at a time can split characters that
+             # span multiple tokens (e.g. emoji with some BPE vocabularies)
+             yield tokenizer.decode([token_id])  # type: ignore[no-any-return]
+
+         response.set_usage(input=len(input_ids), output=token_count)  # pyright: ignore[reportUnknownMemberType]
+
+
+ def _build_messages(
+     prompt: llm.Prompt,
+     conversation: llm.Conversation | None,
+ ) -> list[dict[str, str]]:
+     """Reconstruct a chat message list from the conversation history."""
+     messages: list[dict[str, str]] = []
+
+     if prompt.system:
+         messages.append({"role": "system", "content": prompt.system})
+
+     if conversation is not None:
+         for prev in conversation.responses:
+             messages.append({"role": "user", "content": prev.prompt.prompt or ""})
+             messages.append(
+                 {"role": "assistant", "content": prev.text() or ""},  # type: ignore[union-attr]
+             )
+
+     messages.append({"role": "user", "content": prompt.prompt or ""})
+     return messages
@@ -0,0 +1,57 @@
+ """JSON registry for CoreML model configurations."""
+
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ import llm
+
+
+ def get_registry_path() -> Path:
+     """Return the path to the registry JSON file."""
+     return Path(llm.user_dir()) / "llm-coreml.json"
+
+
+ def _read_registry() -> dict[str, dict[str, str]]:
+     path = get_registry_path()
+     if not path.exists():
+         return {}
+     return json.loads(path.read_text())  # type: ignore[no-any-return]
+
+
+ def _write_registry(data: dict[str, Any]) -> None:
+     path = get_registry_path()
+     path.write_text(json.dumps(data, indent=2) + "\n")
+
+
+ def add_model(name: str, path: str, tokenizer: str, compute_units: str = "all") -> None:
+     """Register a model with the given name, path, tokenizer ID, and compute units."""
+     registry = _read_registry()
+     registry[name] = {
+         "path": path,
+         "tokenizer": tokenizer,
+         "compute_units": compute_units,
+     }
+     _write_registry(registry)
+
+
+ def remove_model(name: str) -> bool:
+     """Remove a model by name. Returns True if it existed."""
+     registry = _read_registry()
+     if name not in registry:
+         return False
+     del registry[name]
+     _write_registry(registry)
+     return True
+
+
+ def list_models() -> dict[str, dict[str, str]]:
+     """Return all registered models."""
+     return _read_registry()
+
+
+ def get_model(name: str) -> dict[str, str] | None:
+     """Return config for a single model, or None if not found."""
+     return _read_registry().get(name)