mtplx 0.1.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mtplx-0.1.0rc1/CITATION.cff +18 -0
- mtplx-0.1.0rc1/LICENSE +201 -0
- mtplx-0.1.0rc1/MANIFEST.in +1 -0
- mtplx-0.1.0rc1/NOTICE +13 -0
- mtplx-0.1.0rc1/PKG-INFO +413 -0
- mtplx-0.1.0rc1/README.md +368 -0
- mtplx-0.1.0rc1/mtplx/__init__.py +18 -0
- mtplx-0.1.0rc1/mtplx/adaptive.py +212 -0
- mtplx-0.1.0rc1/mtplx/artifacts.py +748 -0
- mtplx-0.1.0rc1/mtplx/attention_split.py +399 -0
- mtplx-0.1.0rc1/mtplx/backends/__init__.py +59 -0
- mtplx-0.1.0rc1/mtplx/backends/deepseek_mtp.py +54 -0
- mtplx-0.1.0rc1/mtplx/backends/glm_mtp.py +53 -0
- mtplx-0.1.0rc1/mtplx/backends/mimo_mtp.py +46 -0
- mtplx-0.1.0rc1/mtplx/backends/nemotron_h_mtp.py +47 -0
- mtplx-0.1.0rc1/mtplx/backends/qwen3_next.py +58 -0
- mtplx-0.1.0rc1/mtplx/backends/registry.py +1090 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/__init__.py +1 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/prompts/calibration_coding.jsonl +24 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/prompts/default.jsonl +6 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/prompts/flappy.jsonl +1 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/prompts/long_code.jsonl +1 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/prompts/long_code_uncapped.jsonl +1 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/prompts/python_modules_long.jsonl +1 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/__init__.py +1 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/batch_equivalence.py +203 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/capture_commit_equivalence.py +262 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/competitor_baselines.py +363 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/contract_probe.py +117 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/harness.py +35 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/mtp1_gate.py +174 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/mtp1_sampler_smoke.py +205 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/mtp_adaptive.py +306 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/mtp_chain_probe.py +410 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/mtp_depth_grid.py +323 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/mtp_depth_sweep.py +608 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/mtp_tree_probe.py +347 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/multi_qmv_probe.py +287 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/preflight.py +135 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/runtime_smoke.py +51 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/session_bank.py +234 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/truth.py +509 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/verify_profile.py +366 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/verify_qmm_probe.py +357 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/runners/verify_ratio.py +76 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/schema.py +132 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/validators/__init__.py +1 -0
- mtplx-0.1.0rc1/mtplx/benchmarks/validators/basic.py +138 -0
- mtplx-0.1.0rc1/mtplx/block_attention.py +93 -0
- mtplx-0.1.0rc1/mtplx/cache_state.py +2157 -0
- mtplx-0.1.0rc1/mtplx/cli.py +2853 -0
- mtplx-0.1.0rc1/mtplx/commands/__init__.py +1 -0
- mtplx-0.1.0rc1/mtplx/commands/public.py +5658 -0
- mtplx-0.1.0rc1/mtplx/config.py +127 -0
- mtplx-0.1.0rc1/mtplx/constants.py +82 -0
- mtplx-0.1.0rc1/mtplx/correctors/__init__.py +50 -0
- mtplx-0.1.0rc1/mtplx/correctors/diagonal_affine.py +297 -0
- mtplx-0.1.0rc1/mtplx/correctors/low_rank.py +160 -0
- mtplx-0.1.0rc1/mtplx/deepseek_mtp_patch.py +413 -0
- mtplx-0.1.0rc1/mtplx/diagnostics.py +618 -0
- mtplx-0.1.0rc1/mtplx/draft_lm_head.py +203 -0
- mtplx-0.1.0rc1/mtplx/draft_sampling.py +41 -0
- mtplx-0.1.0rc1/mtplx/engine_session.py +370 -0
- mtplx-0.1.0rc1/mtplx/env.py +93 -0
- mtplx-0.1.0rc1/mtplx/errors.py +86 -0
- mtplx-0.1.0rc1/mtplx/fast_sampling.py +211 -0
- mtplx-0.1.0rc1/mtplx/gdn_capture.py +1832 -0
- mtplx-0.1.0rc1/mtplx/generation.py +4212 -0
- mtplx-0.1.0rc1/mtplx/glm_mtp_patch.py +400 -0
- mtplx-0.1.0rc1/mtplx/graphbank.py +581 -0
- mtplx-0.1.0rc1/mtplx/hf_loader.py +454 -0
- mtplx-0.1.0rc1/mtplx/kernels/__init__.py +35 -0
- mtplx-0.1.0rc1/mtplx/kernels/copy_leaf.py +57 -0
- mtplx-0.1.0rc1/mtplx/kernels/fused_norm.py +278 -0
- mtplx-0.1.0rc1/mtplx/kernels/lm_head_topk.py +532 -0
- mtplx-0.1.0rc1/mtplx/kernels/logits_topk.py +271 -0
- mtplx-0.1.0rc1/mtplx/kernels/native_gdn_tail.py +108 -0
- mtplx-0.1.0rc1/mtplx/kernels/sdpa_2pass.py +364 -0
- mtplx-0.1.0rc1/mtplx/kernels/sdpa_2pass_paged.py +492 -0
- mtplx-0.1.0rc1/mtplx/kernels/verify_mlp_fused.py +932 -0
- mtplx-0.1.0rc1/mtplx/kpi/__init__.py +39 -0
- mtplx-0.1.0rc1/mtplx/kpi/reference_vllm.py +212 -0
- mtplx-0.1.0rc1/mtplx/kpi/runtime_kpis.py +476 -0
- mtplx-0.1.0rc1/mtplx/mimo_mtp_patch.py +300 -0
- mtplx-0.1.0rc1/mtplx/mtp_adapters.py +490 -0
- mtplx-0.1.0rc1/mtplx/mtp_patch.py +677 -0
- mtplx-0.1.0rc1/mtplx/native_mlp.py +276 -0
- mtplx-0.1.0rc1/mtplx/nemotron_h_mtp_patch.py +381 -0
- mtplx-0.1.0rc1/mtplx/profiles.py +283 -0
- mtplx-0.1.0rc1/mtplx/proposal_reranker.py +186 -0
- mtplx-0.1.0rc1/mtplx/runtime.py +205 -0
- mtplx-0.1.0rc1/mtplx/sampling.py +235 -0
- mtplx-0.1.0rc1/mtplx/server/__init__.py +5 -0
- mtplx-0.1.0rc1/mtplx/server/openai.py +4598 -0
- mtplx-0.1.0rc1/mtplx/session_bank.py +528 -0
- mtplx-0.1.0rc1/mtplx/speculative.py +21 -0
- mtplx-0.1.0rc1/mtplx/thermal.py +1500 -0
- mtplx-0.1.0rc1/mtplx/thermal_sidecar.py +141 -0
- mtplx-0.1.0rc1/mtplx/trace_parity.py +335 -0
- mtplx-0.1.0rc1/mtplx/turboquant.py +124 -0
- mtplx-0.1.0rc1/mtplx/ui/__init__.py +24 -0
- mtplx-0.1.0rc1/mtplx/ui/banner.py +75 -0
- mtplx-0.1.0rc1/mtplx/ui/chat_printer.py +171 -0
- mtplx-0.1.0rc1/mtplx/ui/download_progress.py +260 -0
- mtplx-0.1.0rc1/mtplx/ui/onboarding.py +1486 -0
- mtplx-0.1.0rc1/mtplx/ui/panels.py +94 -0
- mtplx-0.1.0rc1/mtplx/ui/progress.py +131 -0
- mtplx-0.1.0rc1/mtplx/verify_qmv.py +1412 -0
- mtplx-0.1.0rc1/mtplx/version.py +6 -0
- mtplx-0.1.0rc1/mtplx.egg-info/PKG-INFO +413 -0
- mtplx-0.1.0rc1/mtplx.egg-info/SOURCES.txt +145 -0
- mtplx-0.1.0rc1/mtplx.egg-info/dependency_links.txt +1 -0
- mtplx-0.1.0rc1/mtplx.egg-info/entry_points.txt +2 -0
- mtplx-0.1.0rc1/mtplx.egg-info/requires.txt +24 -0
- mtplx-0.1.0rc1/mtplx.egg-info/top_level.txt +1 -0
- mtplx-0.1.0rc1/pyproject.toml +71 -0
- mtplx-0.1.0rc1/setup.cfg +4 -0
- mtplx-0.1.0rc1/tests/test_adaptive.py +88 -0
- mtplx-0.1.0rc1/tests/test_artifacts.py +1213 -0
- mtplx-0.1.0rc1/tests/test_cache_state.py +782 -0
- mtplx-0.1.0rc1/tests/test_config.py +75 -0
- mtplx-0.1.0rc1/tests/test_context_degradation_profiles.py +470 -0
- mtplx-0.1.0rc1/tests/test_correctors.py +133 -0
- mtplx-0.1.0rc1/tests/test_diagnostics.py +66 -0
- mtplx-0.1.0rc1/tests/test_download_progress.py +183 -0
- mtplx-0.1.0rc1/tests/test_draft_lm_head.py +37 -0
- mtplx-0.1.0rc1/tests/test_hf_loader.py +188 -0
- mtplx-0.1.0rc1/tests/test_hygiene_scan.py +49 -0
- mtplx-0.1.0rc1/tests/test_max_idle_watchdog.py +189 -0
- mtplx-0.1.0rc1/tests/test_max_lifecycle.py +157 -0
- mtplx-0.1.0rc1/tests/test_mtp_patch.py +47 -0
- mtplx-0.1.0rc1/tests/test_no_mlx_imports.py +291 -0
- mtplx-0.1.0rc1/tests/test_onboarding.py +792 -0
- mtplx-0.1.0rc1/tests/test_openai_bridge.py +576 -0
- mtplx-0.1.0rc1/tests/test_phase0h_paged_verifier_exactness.py +51 -0
- mtplx-0.1.0rc1/tests/test_profiles.py +52 -0
- mtplx-0.1.0rc1/tests/test_prompt_encoding.py +43 -0
- mtplx-0.1.0rc1/tests/test_public_cli.py +2020 -0
- mtplx-0.1.0rc1/tests/test_runtime_kpis.py +64 -0
- mtplx-0.1.0rc1/tests/test_sampling.py +78 -0
- mtplx-0.1.0rc1/tests/test_server_openai.py +225 -0
- mtplx-0.1.0rc1/tests/test_thermal.py +699 -0
- mtplx-0.1.0rc1/tests/test_thermal_sidecar.py +240 -0
- mtplx-0.1.0rc1/tests/test_trace_parity.py +44 -0
- mtplx-0.1.0rc1/tests/test_ui_progress.py +45 -0
- mtplx-0.1.0rc1/tests/test_validators.py +64 -0
- mtplx-0.1.0rc1/tests/test_vllm_reference.py +55 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
title: MTPLX
|
|
3
|
+
message: "If MTPLX helps your project, product, benchmark, article, or research, please cite it using this metadata."
|
|
4
|
+
type: software
|
|
5
|
+
authors:
|
|
6
|
+
- family-names: Altoukhi
|
|
7
|
+
given-names: Youssof
|
|
8
|
+
repository-code: "https://github.com/youssofal/mtplx"
|
|
9
|
+
url: "https://github.com/youssofal/mtplx"
|
|
10
|
+
license: Apache-2.0
|
|
11
|
+
version: 0.1.0rc1
|
|
12
|
+
abstract: "Native MTP speculative decoding for Qwen3-Next on Apple Silicon, using built-in MTP heads with math-correct rejection sampling and an OpenAI/Anthropic-compatible serving surface."
|
|
13
|
+
keywords:
|
|
14
|
+
- speculative decoding
|
|
15
|
+
- MTP
|
|
16
|
+
- MLX
|
|
17
|
+
- Apple Silicon
|
|
18
|
+
- local AI
|
mtplx-0.1.0rc1/LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity. For the purposes of this definition,
|
|
18
|
+
"control" means (i) the power, direct or indirect, to cause the
|
|
19
|
+
direction or management of such entity, whether by contract or
|
|
20
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
22
|
+
|
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
24
|
+
exercising permissions granted by this License.
|
|
25
|
+
|
|
26
|
+
"Source" form shall mean the preferred form for making modifications,
|
|
27
|
+
including but not limited to software source code, documentation
|
|
28
|
+
source, and configuration files.
|
|
29
|
+
|
|
30
|
+
"Object" form shall mean any form resulting from mechanical
|
|
31
|
+
transformation or translation of a Source form, including but
|
|
32
|
+
not limited to compiled object code, generated documentation,
|
|
33
|
+
and conversions to other media types.
|
|
34
|
+
|
|
35
|
+
"Work" shall mean the work of authorship, whether in Source or
|
|
36
|
+
Object form, made available under the License, as indicated by a
|
|
37
|
+
copyright notice that is included in or attached to the work
|
|
38
|
+
(an example is provided in the Appendix below).
|
|
39
|
+
|
|
40
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
|
41
|
+
form, that is based on (or derived from) the Work and for which the
|
|
42
|
+
editorial revisions, annotations, elaborations, or other modifications
|
|
43
|
+
represent, as a whole, an original work of authorship. For the purposes
|
|
44
|
+
of this License, Derivative Works shall not include works that remain
|
|
45
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
|
46
|
+
the Work and Derivative Works thereof.
|
|
47
|
+
|
|
48
|
+
"Contribution" shall mean any work of authorship, including
|
|
49
|
+
the original version of the Work and any modifications or additions
|
|
50
|
+
to that Work or Derivative Works thereof, that is intentionally
|
|
51
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
52
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
|
53
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
|
54
|
+
means any form of electronic, verbal, or written communication sent
|
|
55
|
+
to the Licensor or its representatives, including but not limited to
|
|
56
|
+
communication on electronic mailing lists, source code control systems,
|
|
57
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
|
58
|
+
Licensor for the purpose of discussing and improving the Work, but
|
|
59
|
+
excluding communication that is conspicuously marked or otherwise
|
|
60
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
|
61
|
+
|
|
62
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
63
|
+
on behalf of whom a Contribution has been received by Licensor and
|
|
64
|
+
subsequently incorporated within the Work.
|
|
65
|
+
|
|
66
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
67
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
68
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
69
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
70
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
71
|
+
Work and such Derivative Works in Source or Object form.
|
|
72
|
+
|
|
73
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
74
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
75
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
76
|
+
(except as stated in this section) patent license to make, have made,
|
|
77
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
78
|
+
where such license applies only to those patent claims licensable
|
|
79
|
+
by such Contributor that are necessarily infringed by their
|
|
80
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
|
81
|
+
with the Work to which such Contribution(s) was submitted. If You
|
|
82
|
+
institute patent litigation against any entity (including a
|
|
83
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
84
|
+
or a Contribution incorporated within the Work constitutes direct
|
|
85
|
+
or contributory patent infringement, then any patent licenses
|
|
86
|
+
granted to You under this License for that Work shall terminate
|
|
87
|
+
as of the date such litigation is filed.
|
|
88
|
+
|
|
89
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
90
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
91
|
+
modifications, and in Source or Object form, provided that You
|
|
92
|
+
meet the following conditions:
|
|
93
|
+
|
|
94
|
+
(a) You must give any other recipients of the Work or
|
|
95
|
+
Derivative Works a copy of this License; and
|
|
96
|
+
|
|
97
|
+
(b) You must cause any modified files to carry prominent notices
|
|
98
|
+
stating that You changed the files; and
|
|
99
|
+
|
|
100
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
101
|
+
that You distribute, all copyright, patent, trademark, and
|
|
102
|
+
attribution notices from the Source form of the Work,
|
|
103
|
+
excluding those notices that do not pertain to any part of
|
|
104
|
+
the Derivative Works; and
|
|
105
|
+
|
|
106
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
|
107
|
+
distribution, then any Derivative Works that You distribute must
|
|
108
|
+
include a readable copy of the attribution notices contained
|
|
109
|
+
within such NOTICE file, excluding those notices that do not
|
|
110
|
+
pertain to any part of the Derivative Works, in at least one
|
|
111
|
+
of the following places: within a NOTICE text file distributed
|
|
112
|
+
as part of the Derivative Works; within the Source form or
|
|
113
|
+
documentation, if provided along with the Derivative Works; or,
|
|
114
|
+
within a display generated by the Derivative Works, if and
|
|
115
|
+
wherever such third-party notices normally appear. The contents
|
|
116
|
+
of the NOTICE file are for informational purposes only and
|
|
117
|
+
do not modify the License. You may add Your own attribution
|
|
118
|
+
notices within Derivative Works that You distribute, alongside
|
|
119
|
+
or as an addendum to the NOTICE text from the Work, provided
|
|
120
|
+
that such additional attribution notices cannot be construed
|
|
121
|
+
as modifying the License.
|
|
122
|
+
|
|
123
|
+
You may add Your own copyright statement to Your modifications and
|
|
124
|
+
may provide additional or different license terms and conditions
|
|
125
|
+
for use, reproduction, or distribution of Your modifications, or
|
|
126
|
+
for any such Derivative Works as a whole, provided Your use,
|
|
127
|
+
reproduction, and distribution of the Work otherwise complies with
|
|
128
|
+
the conditions stated in this License.
|
|
129
|
+
|
|
130
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
131
|
+
any Contribution intentionally submitted for inclusion in the Work
|
|
132
|
+
by You to the Licensor shall be under the terms and conditions of
|
|
133
|
+
this License, without any additional terms or conditions.
|
|
134
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
|
135
|
+
the terms of any separate license agreement you may have executed
|
|
136
|
+
with Licensor regarding such Contributions.
|
|
137
|
+
|
|
138
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
139
|
+
names, trademarks, service marks, or product names of the Licensor,
|
|
140
|
+
except as required for reasonable and customary use in describing the
|
|
141
|
+
origin of the Work and reproducing the content of the NOTICE file.
|
|
142
|
+
|
|
143
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
144
|
+
agreed to in writing, Licensor provides the Work (and each
|
|
145
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
146
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
147
|
+
implied, including, without limitation, any warranties or conditions
|
|
148
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
149
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
150
|
+
appropriateness of using or redistributing the Work and assume any
|
|
151
|
+
risks associated with Your exercise of permissions under this License.
|
|
152
|
+
|
|
153
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
|
154
|
+
whether in tort (including negligence), contract, or otherwise,
|
|
155
|
+
unless required by applicable law (such as deliberate and grossly
|
|
156
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
|
157
|
+
liable to You for damages, including any direct, indirect, special,
|
|
158
|
+
incidental, or consequential damages of any character arising as a
|
|
159
|
+
result of this License or out of the use or inability to use the
|
|
160
|
+
Work (including but not limited to damages for loss of goodwill,
|
|
161
|
+
work stoppage, computer failure or malfunction, or any and all
|
|
162
|
+
other commercial damages or losses), even if such Contributor
|
|
163
|
+
has been advised of the possibility of such damages.
|
|
164
|
+
|
|
165
|
+
9. Accepting Warranty or Additional Liability. While redistributing
|
|
166
|
+
the Work or Derivative Works thereof, You may choose to offer,
|
|
167
|
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
|
168
|
+
or other liability obligations and/or rights consistent with this
|
|
169
|
+
License. However, in accepting such obligations, You may act only
|
|
170
|
+
on Your own behalf and on Your sole responsibility, not on behalf
|
|
171
|
+
of any other Contributor, and only if You agree to indemnify,
|
|
172
|
+
defend, and hold each Contributor harmless for any liability
|
|
173
|
+
incurred by, or claims asserted against, such Contributor by reason
|
|
174
|
+
of your accepting any such warranty or additional liability.
|
|
175
|
+
|
|
176
|
+
END OF TERMS AND CONDITIONS
|
|
177
|
+
|
|
178
|
+
APPENDIX: How to apply the Apache License to your work.
|
|
179
|
+
|
|
180
|
+
To apply the Apache License to your work, attach the following
|
|
181
|
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
|
182
|
+
replaced with your own identifying information. (Don't include
|
|
183
|
+
the brackets!) The text should be enclosed in the appropriate
|
|
184
|
+
comment syntax for the file format. We also recommend that a
|
|
185
|
+
file or class name and description of purpose be included on the
|
|
186
|
+
same "printed page" as the copyright notice for easier
|
|
187
|
+
identification within third-party archives.
|
|
188
|
+
|
|
189
|
+
Copyright [yyyy] [name of copyright owner]
|
|
190
|
+
|
|
191
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
192
|
+
you may not use this file except in compliance with the License.
|
|
193
|
+
You may obtain a copy of the License at
|
|
194
|
+
|
|
195
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
196
|
+
|
|
197
|
+
Unless required by applicable law or agreed to in writing, software
|
|
198
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
199
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
200
|
+
See the License for the specific language governing permissions and
|
|
201
|
+
limitations under the License.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
include CITATION.cff
|
mtplx-0.1.0rc1/NOTICE
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
MTPLX
|
|
2
|
+
Copyright 2026 Youssof Altoukhi
|
|
3
|
+
|
|
4
|
+
MTPLX is a native MTP speculative decoding project for Apple Silicon.
|
|
5
|
+
|
|
6
|
+
Preferred attribution for public projects, products, benchmarks, articles, and
|
|
7
|
+
research that use or build on MTPLX:
|
|
8
|
+
|
|
9
|
+
Powered by MTPLX by Youssof Altoukhi
|
|
10
|
+
https://github.com/youssofal/mtplx
|
|
11
|
+
|
|
12
|
+
If MTPLX informs academic or technical writing, please cite the repository using
|
|
13
|
+
the included CITATION.cff metadata.
|
mtplx-0.1.0rc1/PKG-INFO
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mtplx
|
|
3
|
+
Version: 0.1.0rc1
|
|
4
|
+
Summary: Native MTP speculative decoding for Qwen3-Next on Apple Silicon.
|
|
5
|
+
Author-email: Youssof Altoukhi <business@youssofal.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/youssofal/mtplx
|
|
8
|
+
Project-URL: Documentation, https://github.com/youssofal/mtplx/tree/main/docs
|
|
9
|
+
Project-URL: Issues, https://github.com/youssofal/mtplx/issues
|
|
10
|
+
Project-URL: Citation, https://github.com/youssofal/mtplx/blob/main/CITATION.cff
|
|
11
|
+
Keywords: mlx,apple-silicon,speculative-decoding,qwen,mtp,local-ai
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Operating System :: MacOS
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
License-File: NOTICE
|
|
25
|
+
Requires-Dist: fastapi>=0.136
|
|
26
|
+
Requires-Dist: huggingface-hub>=0.36
|
|
27
|
+
Requires-Dist: mlx<0.32,>=0.31; sys_platform == "darwin" and platform_machine == "arm64"
|
|
28
|
+
Requires-Dist: mlx-lm<0.32,>=0.31; sys_platform == "darwin" and platform_machine == "arm64"
|
|
29
|
+
Requires-Dist: numpy>=2
|
|
30
|
+
Requires-Dist: pydantic>=2
|
|
31
|
+
Requires-Dist: rich>=14
|
|
32
|
+
Requires-Dist: safetensors>=0.6
|
|
33
|
+
Requires-Dist: uvicorn>=0.46
|
|
34
|
+
Provides-Extra: competitors
|
|
35
|
+
Requires-Dist: dflash-mlx==0.1.0; extra == "competitors"
|
|
36
|
+
Provides-Extra: server
|
|
37
|
+
Requires-Dist: fastapi>=0.136; extra == "server"
|
|
38
|
+
Requires-Dist: uvicorn>=0.46; extra == "server"
|
|
39
|
+
Provides-Extra: dev
|
|
40
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
42
|
+
Requires-Dist: ruff>=0.8; extra == "dev"
|
|
43
|
+
Requires-Dist: twine>=5; extra == "dev"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
<div align="center">
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
███╗ ███╗ ████████╗ ██████╗ ██╗ ██╗ ██╗
|
|
50
|
+
████╗ ████║ ╚══██╔══╝ ██╔══██╗ ██║ ╚██╗██╔╝
|
|
51
|
+
██╔████╔██║ ██║ ██████╔╝ ██║ ╚███╔╝
|
|
52
|
+
██║╚██╔╝██║ ██║ ██╔═══╝ ██║ ██╔██╗
|
|
53
|
+
██║ ╚═╝ ██║ ██║ ██║ ███████╗ ██╔╝ ██╗
|
|
54
|
+
╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
# **Native MTP speculative decoding on Apple Silicon**
|
|
58
|
+
|
|
59
|
+
**~2.24× over no-MTP AR at `temp=0.6`** on Qwen3.6-27B · math-correct rejection sampling · MLX-native · zero external drafter
|
|
60
|
+
|
|
61
|
+
<sub>Multiplier is hardware-independent. Absolute tok/s scales with memory bandwidth — current public record on M5 Max: **63.056 / 62.886 tok/s** D3, [`Youssofal/Qwen3.6-27B-MTPLX-Optimized-Speed`](https://huggingface.co/Youssofal/Qwen3.6-27B-MTPLX-Optimized-Speed).</sub>
|
|
62
|
+
|
|
63
|
+
[](https://github.com/youssofal/mtplx/actions/workflows/ci.yml)
|
|
64
|
+
[](https://www.python.org/)
|
|
65
|
+
[](https://developer.apple.com/metal/)
|
|
66
|
+
[](CHANGELOG.md)
|
|
67
|
+
[](LICENSE)
|
|
68
|
+
|
|
69
|
+
</div>
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
MTPLX runs **the model's own built-in MTP heads** as a speculative drafter, with **exact probability-ratio acceptance + residual correction** — not the greedy-argmax trick most fast-decode tools use at T>0. That means real coding settings (`temperature=0.6`, `top_p=0.95`, `top_k=20`) actually get the speculative speedup *and* keep the target model's distribution.
|
|
74
|
+
|
|
75
|
+
This is **not** DFlash, DDTree, llama-spec, or an external-drafter system. It's a native-MTP runtime built around MLX, Apple Silicon, and a real OpenAI/Anthropic-compatible serving surface.
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
python3 -m pip install -U mtplx
|
|
79
|
+
mtplx start # interactive: pick model → mode → web/CLI, then chat
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
That's it. The wizard handles the default speed model (`Youssofal/Qwen3.6-27B-MTPLX-Optimized-Speed`), runtime mode, and surface (browser chat at `127.0.0.1:8000/` or terminal chat) on first run. On every subsequent run it asks "same as last time?" so you're one keypress from chatting.
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## What you get
|
|
87
|
+
|
|
88
|
+
- **Native MTP speculative decoding.** Built-in MTP heads, no external drafter, no RAM hit for a second model.
|
|
89
|
+
- **Math-correct sampling at T=0.6.** Probability-ratio acceptance with residual correction. Verified `max_diff = 0.0` against reference single-token AR on the verified Qwen3.6-27B path.
|
|
90
|
+
- **~2.24× over no-MTP AR at `temp=0.6`.** The hardware-independent number, which the CLI reports as `mean_speedup_vs_ar`. Verified contract on the public default `Youssofal/Qwen3.6-27B-MTPLX-Optimized-Speed`: `63.056 / 62.886 tok/s` MTP-D3 vs `28.156 tok/s` no-MTP AR, on Apple Silicon M5 Max with `--max` fans, target sampler `temp=0.6 top_p=0.95 top_k=20`, draft sampler `temp=0.70`. Absolute tok/s scales with memory bandwidth; the 2.24× multiplier doesn't.
|
|
91
|
+
- **Real serving surface.** OpenAI-compatible `/v1/chat/completions` + `/v1/completions` + `/v1/models`, Anthropic-compatible `/v1/messages` (streaming SSE), `/health`, `/metrics`. Plug it into Open WebUI, Claude Code, Cline, Continue, or anything that speaks OpenAI.
|
|
92
|
+
- **In-browser chat UI** with auto-detected model context (256k for Qwen3.6), live tokens-per-second, markdown rendering, code-block copy buttons, a stop button, and a settings sidebar that persists per-machine.
|
|
93
|
+
- **Interactive start wizard.** Pick model, mode, and surface in three numbered prompts. Returning users get "same as last time?". No flag-soup required.
|
|
94
|
+
- **Local-folder model picker.** Point the wizard at any parent directory — your `~/models/`, the LM Studio cache, the HuggingFace cache — and it walks the tree, classifies each model into the four-tier compatibility contract, and presents a numbered picker. Config-only classification, never mmaps a tensor file, so a single APFS-dataless or partial download in the tree can't crash the picker.
|
|
95
|
+
- **One-line live download progress.** Single rich-rendered line with bar / percent / GB / speed / ETA, streamed at 8 fps. HuggingFace's tqdm bars are suppressed during the download so they don't fight the MTPLX UI for terminal real estate.
|
|
96
|
+
- **Honest profile names that tell you what they do.**
|
|
97
|
+
- `Medium` — default native-MTP speed path (`performance-cold`), in the **~2.2× over no-MTP AR** lane, not sustained without fan control.
|
|
98
|
+
- `Max` — Medium + ThermalForge fans pinned at 100%, **~2.24× over no-MTP AR** in the recorded lane (`63.056 / 62.886 tok/s` MTP vs `28.156 tok/s` AR on M5 Max), loud by design.
|
|
99
|
+
- `Stable` — hidden compatibility flag (`--profile stable` / `--profile safe`) for the exact/staged long-reply path.
|
|
100
|
+
- **Crash-safe fan control.** When Max is on, MTPLX spawns a detached watchdog that restores fans to auto if the parent dies for any reason — including `kill -9` and "I closed the terminal". Verified live on hardware.
|
|
101
|
+
- **Idle-aware Max mode.** Server tracks request activity; after 15 minutes of no chat, fans drop to auto, then ramp back up on the next message.
|
|
102
|
+
- **Four-tier model compatibility contract.** `mtplx inspect <model>` reports: verified / arch-compatible-unverified / incompatible-architecture / no-MTP. No silent garbage runs.
|
|
103
|
+
- **Lazy imports.** `mtplx --help`, `doctor`, `inspect`, `init`, `setup` work on a fresh venv *without MLX installed*. Generation and serving pull in MLX only when needed.
|
|
104
|
+
- **Preview status: 562-test suite green**, including end-to-end onboarding, local-folder picker, live download progress, fan-control crash safety, OpenAI server fake-state, lazy-import survival, exactness gates.
|
|
105
|
+
|
|
106
|
+
> **Preview honesty.** The cold path is verified at the **~2.24× multiplier** above. *Sustained* no-fan long-context throughput is currently in a worse lane on Flappy 10k versus the v0.2 target — the v0.1 release ships with this gap explicit. Closing it is the v0.2 deliverable; see [Roadmap](#roadmap).
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Quick start (full)
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
# 1. Install from PyPI
|
|
114
|
+
python3 -m pip install -U mtplx
|
|
115
|
+
|
|
116
|
+
# 2. Verify the install
|
|
117
|
+
mtplx help
|
|
118
|
+
mtplx doctor --json
|
|
119
|
+
|
|
120
|
+
# 3. Chat (the wizard does everything)
|
|
121
|
+
mtplx start
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Power-user shortcuts (any of these skip the wizard):
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
mtplx start --fresh # re-run the wizard from scratch
|
|
128
|
+
mtplx start cli # terminal chat directly
|
|
129
|
+
mtplx start --max # browser chat with fan boost
|
|
130
|
+
mtplx start --model /path/to/model # use a specific local or HF model
|
|
131
|
+
mtplx pull Youssofal/Qwen3.6-27B-MTPLX-Optimized-Speed
|
|
132
|
+
mtplx quickstart --port 8000 # API server only, no chat
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
OpenAI-compatible smoke test:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
curl http://127.0.0.1:8000/v1/chat/completions \
|
|
139
|
+
-H 'Content-Type: application/json' \
|
|
140
|
+
-d '{"model":"mtplx","messages":[{"role":"user","content":"hi"}],"stream":true}'
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
The GitHub release wheel remains available for reproducible Preview 1 installs, but PyPI is the primary public path. If your Python blocks global installs, create and activate a virtual environment first:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
python3 -m venv .venv
|
|
147
|
+
. .venv/bin/activate
|
|
148
|
+
python -m pip install -U pip mtplx
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## How it actually works
|
|
154
|
+
|
|
155
|
+
Most "fast decode on Apple Silicon" projects fall into one of three buckets:
|
|
156
|
+
|
|
157
|
+
| Approach | What they do at T>0 | What MTPLX does |
|
|
158
|
+
|---|---|---|
|
|
159
|
+
| llama.cpp / mlx-lm AR | No speculation, target model only | Speculative with a built-in drafter |
|
|
160
|
+
| DFlash, prefix-match speculation | Greedy-argmax equality (silently breaks at T>0) | Probability-ratio acceptance + residual correction |
|
|
161
|
+
| External-drafter speculation | Loads a second model into RAM | Uses the target's own MTP heads — zero extra RAM |
|
|
162
|
+
|
|
163
|
+
The math-correctness wedge is real. At `temperature=0.6`, the difference between "rejected because the draft argmax disagrees" and "rejected via the Leviathan/Chen rejection-sampling theorem" is the difference between a benchmark trick and a runtime your code editor can trust. MTPLX does the latter, including residual correction `(p − q)+` for the cases where the draft was rejected.
|
|
164
|
+
|
|
165
|
+
**Verified evidence (current public default `Youssofal/Qwen3.6-27B-MTPLX-Optimized-Speed`):**
|
|
166
|
+
- **~2.24× over matched no-MTP AR at `temp=0.6`** on Apple Silicon M5 Max: `63.056 / 62.886 tok/s` MTP-D3 paired runs vs `28.156 tok/s` no-MTP AR, same machine, same target sampler (`temp=0.6 top_p=0.95 top_k=20`), draft sampler `temp=0.70 top_p=0.95 top_k=20`, performance-cold profile, fans pinned by `--max`, thinking mode off. Recorded in `mtplx_runtime.json` under the model.
|
|
167
|
+
- The multiplier is what the CLI reports as `mean_speedup_vs_ar`. Absolute tok/s above is M5-Max-with-614-GB/s-bandwidth-specific; if your Mac is slower you keep the **2.24×** ratio, the absolute number drops with bandwidth.
|
|
168
|
+
- Per-position acceptance on the recorded prompt: `[100%, 97.96%, 93.88%]` at D3 (corrections=3 over 49 verify calls).
|
|
169
|
+
- Distribution exactness vs reference single-token AR: `max_diff = 0.0`. Greedy diagnostic on the same cleaned window: `60.108 tok/s`.
|
|
170
|
+
|
|
171
|
+
```mermaid
|
|
172
|
+
flowchart LR
|
|
173
|
+
A[Prompt] --> B[Target model<br/>Qwen3.6-27B]
|
|
174
|
+
    B --> C[Built-in MTP heads<br/>draft K=3]
|
|
175
|
+
C --> D[Probability-ratio<br/>acceptance + residual correction]
|
|
176
|
+
D --> E[Verified tokens]
|
|
177
|
+
E -->|loop| B
|
|
178
|
+
F[OpenAI-compatible server<br/>Anthropic-compatible /v1/messages]
|
|
179
|
+
E --> F
|
|
180
|
+
G[Browser chat<br/>or terminal chat]
|
|
181
|
+
F --> G
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
No second model, no greedy hack, no external drafter, no silent distribution drift.
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## Modes
|
|
189
|
+
|
|
190
|
+
Picked by `mtplx start`, or set explicitly via `--profile`. Every mode preserves exactness; the difference is the runtime path and whether MTPLX touches your fans.
|
|
191
|
+
|
|
192
|
+
| Mode | Profile | Mechanics | Speed lane | Best for |
|
|
193
|
+
|---|---|---|---|---|
|
|
194
|
+
| **Medium** | `performance-cold` | Native-MTP speed path, Apple fan curve | ~2.2× over no-MTP AR, not sustained without fans | Default first run, short replies, snappy chat |
|
|
195
|
+
| **Max** | `performance-cold` + `--max` | Medium path plus ThermalForge pinned to 100% | **~2.24× over no-MTP AR** (recorded: 63.056/62.886 vs 28.156 tok/s on M5 Max) | Sustained workloads, you don't mind fans |
|
|
196
|
+
| **Stable** | `stable` / `safe` | Exact/staged long-reply path, hidden from onboarding | Lower peak speed, steadier shape | Compatibility and conservative long replies |
|
|
197
|
+
|
|
198
|
+
`Max` requires ThermalForge. `mtplx max --install` installs it from source into `~/.mtplx/bin/thermalforge`, sets up a passwordless sudoers rule scoped to that one binary, and verifies fans actually ramp before declaring success. One sudo prompt, end-to-end. Crash safety covers SIGINT, SIGTERM, SIGHUP, terminal close, and `kill -9` via a detached sidecar process.
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## Compatibility
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
mtplx inspect <model-path-or-hf-repo> --json
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
| Tier | Means | Behavior |
|
|
209
|
+
|---|---|---|
|
|
210
|
+
| **Verified** | Has `mtplx_runtime.json` and passed MTPLX gates | Runs |
|
|
211
|
+
| **Arch-compatible, unverified** | Qwen3-Next MTP markers detected, no runtime contract | Refuses unless `--unsafe-force-unverified` |
|
|
212
|
+
| **Incompatible architecture** | MTP exists but not Qwen3-Next | Clear error, roadmap pointer |
|
|
213
|
+
| **No MTP** | No MTP head detected | Clear error, no garbage runs |
|
|
214
|
+
|
|
215
|
+
v0.1 ships verified Qwen3.6-27B via `Youssofal/Qwen3.6-27B-MTPLX-Optimized-Speed`, with public served model id `mtplx-qwen36-27b-optimized-speed`. The compatibility registry already detects DeepSeek V3 / V3.2, GLM-4 MoE / MoE-Lite, MiMo, and MiniMax M2 — unsupported runtime families stay behind explicit compatibility gates rather than silently running.
|
|
216
|
+
|
|
217
|
+
### Support matrix
|
|
218
|
+
|
|
219
|
+
| Area | Preview support |
|
|
220
|
+
|---|---|
|
|
221
|
+
| Mac | Apple Silicon only (`arm64`) |
|
|
222
|
+
| macOS | 14.0+; Sequoia is supported |
|
|
223
|
+
| Python | native arm64 Python 3.10+ |
|
|
224
|
+
| MLX | `python3 -m pip install mlx` in the same native environment |
|
|
225
|
+
| Memory | dynamic preflight; warns below 48 GiB, fails when the selected model/profile estimate exceeds 80% of unified memory |
|
|
226
|
+
| Storage | first download requires `max(model_size * 2.5, model_size + 20 GiB)` free on the model-cache filesystem |
|
|
227
|
+
| Docker/Open WebUI | Docker Desktop current plus previous two macOS major releases |
|
|
228
|
+
|
|
229
|
+
Run `mtplx doctor --summary`, `mtplx doctor --deep --json`, or `mtplx doctor --bundle` before filing a bug. Bundles are redacted by default under `~/.mtplx/reports/`.
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## CLI surface
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
mtplx start # interactive setup, then chat
|
|
237
|
+
mtplx help # detailed help; `mtplx help <command>` for any
|
|
238
|
+
mtplx doctor # install + model + integration health
|
|
239
|
+
mtplx inspect <model> # four-tier compatibility report
|
|
240
|
+
mtplx init # write ~/.mtplx/config.toml
|
|
241
|
+
mtplx setup # download verified model, prepare cache
|
|
242
|
+
mtplx pull # download the default HF model safely
|
|
243
|
+
mtplx models # cached models, validation, size, delete command
|
|
244
|
+
mtplx run "..." # one-shot ask
|
|
245
|
+
mtplx chat # terminal chat
|
|
246
|
+
mtplx quickstart       # OpenAI/Anthropic-compatible server
|
|
247
|
+
mtplx connect openwebui # paste settings for Open WebUI
|
|
248
|
+
mtplx openwebui docker-command
|
|
249
|
+
mtplx bench run --suite cold-long-code-192
|
|
250
|
+
mtplx max --install # install ThermalForge for Max mode
|
|
251
|
+
mtplx max --status # fan / thermal state
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
Every command has `--json` for machine-readable output and `--help` for context-specific docs.
|
|
255
|
+
|
|
256
|
+
---
|
|
257
|
+
|
|
258
|
+
## Architecture
|
|
259
|
+
|
|
260
|
+
The architectural achievement is **a single-model native-MTP runtime that's mathematically exact at temperature**, with a real serving surface bolted on. There is no second drafter, no greedy hack, and no "drop in a fast-decode library" wrapper. Four layers, drawn the way they actually run.
|
|
261
|
+
|
|
262
|
+
### 0. MLX runtime layer (the kernel stack we own)
|
|
263
|
+
|
|
264
|
+
MTPLX is not a thin wrapper over stock MLX — the speed lane sits on top of an **MLX source fork** plus a small set of **custom Metal kernels** registered as primitives. Stock `mlx-lm` cannot reproduce the multiplier above; the runtime layer is what makes the speculative cycle in §2 tractable on Apple Silicon.
|
|
265
|
+
|
|
266
|
+
What we changed at the MLX source level (fork: `mlx-mtplx-0.31.2-qmm`, commit `2377a99f` "Tune small-M qmv for MTPLX 60TPS path"):
|
|
267
|
+
|
|
268
|
+
- **Small-M `qmv` retuning.** The verify forward is dominated by quantized-matrix-vector ops at `M ≈ 3..6` (one position per accepted draft). Stock MLX's `qmv_fast_impl` is tuned for large M and stalls dispatch at small M. Our fork: `BN16` group-size, **4-simdgroup** instead of 2-simdgroup, `unroll_count(4)` on the inner loop. Cuts the verify-MLP region by enough to be the difference between "MTP loses to AR" and "MTP at ~2.24×".
|
|
269
|
+
- **Source-primitive registration.** Custom kernels (below) are registered through `mlx.core.fast.metal_kernel` and integrated into MLX's graph the same way stock primitives are, so `mx.compile` can fuse around them and `mx.eval` doesn't see them as opaque blocks.
|
|
270
|
+
|
|
271
|
+
Custom Metal kernels we shipped on top of the fork:
|
|
272
|
+
|
|
273
|
+
- **`linear-gdn-from-conv-tape`** — the GDN linear-attention path during verify. Records an *innovation tape* of `(token, gate, state-delta)` tuples during the draft phase, then **replays** them deterministically on rollback when a draft is rejected. Replaces stock MLX's `Conv1d` + recurrent-state restore with a single fused kernel that's bit-exact (`max_diff = 0.0` against batched-vs-sequential reference) and shape-stable.
|
|
274
|
+
- **`verify_qmv` (small-M qmv kernel).** Direct successor of dflash-mlx's M=16 idea, retuned for MTPLX's M=3..6 verify shapes. Now subsumed by the MLX-source qmv tuning above for the verify hot path; remains as a standalone primitive for diagnostic regressions.
|
|
275
|
+
- **GraphBank.** A cache of `mx.compile`-compiled verify graphs, keyed by `(suffix_length, depth, profile)`. Each verify shape gets one compiled graph reused across cycles — no per-cycle Python dispatch overhead. Capture-commit + GraphBank together hit `capture_commit_time_s ≈ 0.073 ms` per cycle (vs `verify_time_s ≈ 47 ms` per cycle), i.e. the commit step is three orders of magnitude smaller than the verify itself.
|
|
276
|
+
- **Draft-only 4-bit / 3-bit LM head** built in memory by `scripts/probe_draft_lm_head_requant.py`. The target's `lm_head` stays at the model's actual precision (BF16 / INT4 affine); the drafter gets a separate, much smaller LM-head requantized for proposal use only. Cuts draft time by ~29% without touching target accuracy.
|
|
277
|
+
|
|
278
|
+
Runtime knobs that ship on by default in `performance-cold`:
|
|
279
|
+
|
|
280
|
+
- `MTPLX_LAZY_VERIFY_LOGITS=1` · `MTPLX_BATCH_TARGET_ARRAYS=1` · `MTPLX_LAZY_MTP_HISTORY_APPEND=1` · `MTPLX_DROP_EVENTS=1` · `MTPLX_SKIP_VERIFY_SNAPSHOT=1`.
|
|
281
|
+
|
|
282
|
+
Numerical hygiene (these are correctness fixes, not speed):
|
|
283
|
+
|
|
284
|
+
- **`fp32` `p/q` ratio** during probability-ratio acceptance. The Leviathan–Chen ratio underflows in BF16 at small `q`; fp32 is the only safe path.
|
|
285
|
+
- **`mx.random.split` per draft position** so each acceptance roll uses an independent RNG key. Without this, depth>1 would silently correlate accept decisions.
|
|
286
|
+
|
|
287
|
+
```mermaid
|
|
288
|
+
flowchart TB
|
|
289
|
+
subgraph FORK["MLX source fork · mlx-mtplx-0.31.2-qmm"]
|
|
290
|
+
QMV["small-M qmv: BN16 · 4-simdgroup · unroll_count(4)<br/>tuned for M=3..6 verify shapes"]
|
|
291
|
+
REG["mx.fast.metal_kernel + source-primitive registration<br/>(mx.compile fuses across our kernels)"]
|
|
292
|
+
end
|
|
293
|
+
subgraph KERNS["Custom Metal kernels"]
|
|
294
|
+
TAPE["linear-gdn-from-conv-tape<br/>fused GDN verify + innovation-tape rollback"]
|
|
295
|
+
VQMV["verify_qmv · small-M qmv (diagnostic)"]
|
|
296
|
+
DLM["Draft-only 4/3-bit LM head<br/>built in memory, target lm_head untouched"]
|
|
297
|
+
end
|
|
298
|
+
subgraph GRAPH["Compiled graphs"]
|
|
299
|
+
BANK2["GraphBank · mx.compile per (suffix_len, depth, profile)<br/>capture_commit_time ≈ 0.073 ms / cycle"]
|
|
300
|
+
end
|
|
301
|
+
subgraph HYGIENE["Numerical hygiene"]
|
|
302
|
+
FP32["fp32 p/q ratio (BF16 underflow at small q)"]
|
|
303
|
+
RNG["mx.random.split per draft position"]
|
|
304
|
+
end
|
|
305
|
+
FORK --> KERNS
|
|
306
|
+
KERNS --> GRAPH
|
|
307
|
+
GRAPH --> HOT["used by speculative cycle in §2"]
|
|
308
|
+
HYGIENE --> HOT
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### 1. Single-model runtime
|
|
312
|
+
|
|
313
|
+
The target model and the drafter are the **same checkpoint**. Qwen3.6-27B ships native MTP heads; MTPLX uses them as the speculative drafter. Zero RAM cost for a second model, zero distillation, zero "we trained a drafter" handoff. The trunk's KV cache obeys a **committed-history contract** (verified against the vLLM CUDA reference at cosine > 0.9998 through D5) so recursive draft depth holds together — that's what lets D2/D3/D4 acceptance reach the 90s instead of collapsing.
|
|
314
|
+
|
|
315
|
+
```mermaid
|
|
316
|
+
flowchart LR
|
|
317
|
+
subgraph TGT["Target model · Qwen3.6-27B (single checkpoint)"]
|
|
318
|
+
TRUNK["Trunk · 64 layers (48 GDN + 16 full-attn)<br/>committed-history KV cache"]
|
|
319
|
+
HEAD["Built-in MTP heads · recursive depth K=3 default"]
|
|
320
|
+
TRUNK -.shares hidden states.-> HEAD
|
|
321
|
+
end
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
### 2. Speculative cycle (the hot loop)
|
|
325
|
+
|
|
326
|
+
Per cycle: the MTP head drafts K tokens, the target verifies all K in parallel via one batched forward, **probability-ratio acceptance** (Leviathan–Chen) decides per-position, **residual correction `(p − q)+`** emits a clean replacement on rejection, and a **bonus token** falls out for free when all K accept. Verify cost is paid by `capture_commit` + the `linear-gdn-from-conv-tape` GDN kernel + a **GraphBank** of compiled verify shapes; the math is exact at any temperature.
|
|
327
|
+
|
|
328
|
+
```mermaid
|
|
329
|
+
flowchart LR
|
|
330
|
+
DRAFT["MTP head drafts q₁..q_K<br/>+ proposal probabilities"] --> VERIFY
|
|
331
|
+
VERIFY["Target batched verify forward<br/>capture-commit · linear-gdn-from-conv-tape · GraphBank"] --> ACCEPT
|
|
332
|
+
ACCEPT["Probability-ratio acceptance<br/>(Leviathan–Chen at any T)"] -->|all K accepted| BONUS
|
|
333
|
+
ACCEPT -->|rejected at i| CORR
|
|
334
|
+
BONUS["Bonus token at K+1 (free)"] --> COMMIT
|
|
335
|
+
CORR["Residual (p − q)+ token at i"] --> COMMIT
|
|
336
|
+
COMMIT["Committed-history KV writeback"] -->|next cycle| DRAFT
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
### 3. Serving stack
|
|
340
|
+
|
|
341
|
+
The runtime is wrapped in a real serving surface so you can point Open WebUI / Claude Code / Cline / Continue / `curl` / `openai-python` / `anthropic-python` at it. **Engine sessions** keep per-chat state; the **Session Bank** preserves warm-prefix exact state across turns (verified `logits_max_abs_diff = 0.0` against fresh forwards) so multi-turn TTFT doesn't collapse the way a stateless shim would.
|
|
342
|
+
|
|
343
|
+
```mermaid
|
|
344
|
+
flowchart TB
|
|
345
|
+
subgraph CLIENTS["Clients"]
|
|
346
|
+
BR["Browser chat<br/>127.0.0.1:8000"]
|
|
347
|
+
OW["Open WebUI · Cline · Claude Code · Continue"]
|
|
348
|
+
CURL["curl · openai-python · anthropic-python"]
|
|
349
|
+
TERM["Terminal chat (mtplx start cli)"]
|
|
350
|
+
end
|
|
351
|
+
subgraph API["FastAPI server"]
|
|
352
|
+
OAI["/v1/chat/completions · /v1/completions · /v1/models"]
|
|
353
|
+
ANT["/v1/messages (Anthropic SSE translator)"]
|
|
354
|
+
OBS["/health · /metrics"]
|
|
355
|
+
end
|
|
356
|
+
subgraph ENG["Engine layer"]
|
|
357
|
+
SESS["Engine sessions (per-chat context + cache)"]
|
|
358
|
+
BANK["Session bank · warm-prefix exact-state reuse"]
|
|
359
|
+
end
|
|
360
|
+
BR --> OAI
|
|
361
|
+
OW --> OAI
|
|
362
|
+
OW --> ANT
|
|
363
|
+
CURL --> OAI
|
|
364
|
+
CURL --> ANT
|
|
365
|
+
TERM --> ENG
|
|
366
|
+
OAI --> SESS
|
|
367
|
+
ANT --> SESS
|
|
368
|
+
SESS --> BANK
|
|
369
|
+
BANK -->|drives| RUNTIME["Native-MTP runtime (cycle above)"]
|
|
370
|
+
OBS --- ENG
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
The CLI (`mtplx start` / `pull` / `doctor` / `inspect` / `max`) is the on-ramp to all of the above and not the architectural story — it lazy-imports MLX so `--help`, `doctor`, `inspect`, `init`, `setup` work on a fresh venv with no GPU/Apple-Silicon stack installed.
|
|
374
|
+
|
|
375
|
+
---
|
|
376
|
+
|
|
377
|
+
## Roadmap
|
|
378
|
+
|
|
379
|
+
**v0.1.0-preview.1 (today).** Verified Qwen3-Next-MTP cold path, OpenAI/Anthropic-compatible serving, in-browser chat, interactive `mtplx start` wizard with local-folder model picker and one-line live download progress, four-tier compatibility, crash-safe Max mode, lazy-import CLI surface, 562-test suite green.
|
|
380
|
+
|
|
381
|
+
**v0.2 — sustained throughput.** Diagnostic-gated kernel ladder targeting `last64/first64 ≥ 0.90` no-fan on 10k generations while preserving the **~2.24× multiplier** lane. Mechanism-driven: lazy-graph severance + output narrowing if graph history is the bottleneck; MLX-primitive-registered cache-update + `mx.compile` if dispatch tax dominates; an owned GDN+MLP verify-cycle kernel via `mx.fast.metal_kernel` only if the cheaper paths don't close the gap.
|
|
382
|
+
|
|
383
|
+
**v0.3 — broader fleet.** DeepSeek V3 / V3.2 MTP backend (registered, runtime pending), GLM-4 MoE backend, MiMo backend, generic MTP backend behind `mtplx_runtime.json`. Optional Homebrew tap. Multi-session server concurrency.
|
|
384
|
+
|
|
385
|
+
The kernel-ladder direction is grounded in a six-agent deep-research synthesis (Compass / GPT Pro / Gemini ×2 / Claude ×2 / final validation pass) plus a closed-branch failure ledger that's already 35+ entries deep. We don't ship benchmark theater.
|
|
386
|
+
|
|
387
|
+
---
|
|
388
|
+
|
|
389
|
+
## What MTPLX is *not*
|
|
390
|
+
|
|
391
|
+
- It's not DFlash. DFlash uses greedy-argmax prefix matching and breaks the target distribution at T>0. MTPLX implements exact probability-ratio rejection sampling.
|
|
392
|
+
- It's not an external-drafter system. There's no second model. The drafter is the target's own MTP heads.
|
|
393
|
+
- It's not a generic "speculative decoding library". It's a runtime + serving stack with an explicit model-compatibility contract.
|
|
394
|
+
- It's not a CUDA project. MTPLX is MLX-native and Apple-Silicon-first. Linux/CUDA is not on the roadmap; for that, use vLLM.
|
|
395
|
+
- It's not finished. v0.1 is a preview. The **~2.24× multiplier** cold-lane target is met, the sustained-no-fan target is not, and the README says so.
|
|
396
|
+
|
|
397
|
+
---
|
|
398
|
+
|
|
399
|
+
## License, citation, and attribution
|
|
400
|
+
|
|
401
|
+
MTPLX builds on [MLX](https://github.com/ml-explore/mlx) and the Qwen3-Next model family. The speculative-sampling math follows Leviathan et al. 2023 ("Fast Inference from Transformers via Speculative Decoding") and Chen et al. 2023 ("Accelerating Large Language Model Decoding with Speculative Sampling"), and the MTP heads ship with Qwen. Design and diagnostics are informed by vLLM speculative decoding, vLLM-Metal (issues #188 and #281), DFlash-MLX, DDTree-MLX, and DeepSeek V3.2's `mx.depends` precedent. Optional fan control via [ThermalForge](https://github.com/ProducerGuy/ThermalForge). Model weights and licenses remain governed by their upstream model cards.
|
|
402
|
+
|
|
403
|
+
MTPLX is released under the [Apache License 2.0](LICENSE). If you redistribute MTPLX or derivative works, preserve the Apache license and the attribution notices from [NOTICE](NOTICE) as required by Apache-2.0.
|
|
404
|
+
|
|
405
|
+
If MTPLX powers a public project, product, benchmark, article, or research result, please include clear credit in your README, docs, paper, or public writeup:
|
|
406
|
+
|
|
407
|
+
> Powered by MTPLX by Youssof Altoukhi
|
|
408
|
+
>
|
|
409
|
+
> https://github.com/youssofal/mtplx
|
|
410
|
+
|
|
411
|
+
For academic or technical writing, cite the repository using [CITATION.cff](CITATION.cff).
|
|
412
|
+
|
|
413
|
+
— Built by [Youssof Altoukhi](https://github.com/youssofal). Contributions, bug reports, and benchmark replications welcome via [Issues](https://github.com/youssofal/mtplx/issues).
|