flash-head 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flash_head-0.1.0/LICENSE +117 -0
- flash_head-0.1.0/PKG-INFO +7 -0
- flash_head-0.1.0/README.md +240 -0
- flash_head-0.1.0/pyproject.toml +21 -0
- flash_head-0.1.0/setup.cfg +4 -0
- flash_head-0.1.0/src/flash_head/__init__.py +71 -0
- flash_head-0.1.0/src/flash_head/_version.py +5 -0
- flash_head-0.1.0/src/flash_head/flash_head.py +356 -0
- flash_head-0.1.0/src/flash_head/loading.py +208 -0
- flash_head-0.1.0/src/flash_head/patches/__init__.py +39 -0
- flash_head-0.1.0/src/flash_head/patches/eagle.py +41 -0
- flash_head-0.1.0/src/flash_head/patches/gpu_model_runner.py +37 -0
- flash_head-0.1.0/src/flash_head/patches/llm.py +67 -0
- flash_head-0.1.0/src/flash_head/patches/logits_processor.py +43 -0
- flash_head-0.1.0/src/flash_head/patches/rejection_sampler.py +85 -0
- flash_head-0.1.0/src/flash_head/patches/sampler.py +32 -0
- flash_head-0.1.0/src/flash_head.egg-info/PKG-INFO +7 -0
- flash_head-0.1.0/src/flash_head.egg-info/SOURCES.txt +20 -0
- flash_head-0.1.0/src/flash_head.egg-info/dependency_links.txt +1 -0
- flash_head-0.1.0/src/flash_head.egg-info/entry_points.txt +2 -0
- flash_head-0.1.0/src/flash_head.egg-info/top_level.txt +1 -0
- flash_head-0.1.0/tests/test_compare.py +194 -0
flash_head-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
EMBEDL MODELS COMMUNITY LICENSE AGREEMENT
|
|
2
|
+
v.1.0
|
|
3
|
+
|
|
4
|
+
Embedl is pleased to make available Embedl-optimized AI models and accompanying tools, SDKs, and source code files under this Community License Agreement (the "License" or the "Terms"). Our goal is to foster a vibrant community of developers, researchers, and organizations who can freely use, modify, and build upon these optimized AI models and tools. To that end, this license permits unlimited use, distribution and commercialization for most individuals and organizations, subject only to the limited restrictions described below.
|
|
5
|
+
|
|
6
|
+
To sustain our ability to continue developing and releasing optimized AI models, we have established three limitations on use. First, competitors to Embedl may not use the licensed materials, as permitting such use would undermine our ability to invest in future development. Second, larger organizations—those with more than 250 employees or annual revenue exceeding €10 million—can use the licensed materials for internal evaluation purposes, but must obtain a separate Commercial Use license before using the licensed materials for commercial purposes; this ensures that entities with greater resources contribute appropriately to the ecosystem. Third, no licensee may host the licensed materials as a service, which protects both Embedl's interests and the integrity of the community by preventing commoditization of these models (although specific use cases are possible with prior approval from us).
|
|
7
|
+
|
|
8
|
+
We recognize that individual circumstances vary, and we welcome inquiries from those who wish to discuss alternative licensing arrangements that fall outside these terms. Please contact Embedl at legal@embedl.com to discuss your specific use case. We would be happy to work with you.
|
|
9
|
+
|
|
10
|
+
1.0 DEFINED TERMS
|
|
11
|
+
|
|
12
|
+
"Commercial Use": (a) any provision of the functionality of the Embedl Models, in whole or in part, in any product, offering, consultancy, or as a service; (b) any Distribution; (c) any use of the Embedl Models in whole or in part for direct or indirect strategic or commercial gain or advantage; or (d) any use of the Embedl Models or any of its outputs or results, in whole or in part, for purposes of training or improving any other large language model.
|
|
13
|
+
|
|
14
|
+
"Competitor": any person or entity in the business of selling access to AI development and optimization tools.
|
|
15
|
+
|
|
16
|
+
"Contribution": any contribution to the Embedl Models that is intentionally submitted for inclusion in the Embedl Models by the contribution owner (copyright and otherwise), or by a person or entity authorized to submit the contribution on behalf of such owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to us, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems, but excluding communication that is conspicuously marked or otherwise designated in writing by the contribution owner as "Not a Contribution."
|
|
17
|
+
|
|
18
|
+
"Contributor": any person or entity submitting a Contribution that has been accepted by us and incorporated within the Embedl Models.
|
|
19
|
+
|
|
20
|
+
"Distribution" or "Distribute": means any transmission, distribution, deployment, publication, or other sharing of the Embedl Models in whole or in part, whether standalone or as incorporated or bundled with other works, services, or products, to any third party, including by providing or making the Embedl Models or its functionality available as a hosted service via API, web access, or any other electronic or remote means.
|
|
21
|
+
|
|
22
|
+
"Embedl Models": the AI models, software, algorithms, code, and weights, licensed by us under these Terms, as any of the foregoing may be updated or modified by us on one or more occasions.
|
|
23
|
+
|
|
24
|
+
"Third Party Terms": terms and policies applicable to Llama, Gemma, and Qwen, including terms currently available at https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/LICENSE, https://ai.google.dev/gemma/terms, and https://www.apache.org/licenses/LICENSE-2.0 (for Qwen materials), as well as the policies set forth in section 3.9.
|
|
25
|
+
|
|
26
|
+
"We," "us," "our," or "Embedl": Embedl AB, organized under the laws of Sweden.
|
|
27
|
+
|
|
28
|
+
"You" or "your": the person, and any entity such person works for or on behalf of, exercising the rights of this license, excluding any Competitor.
|
|
29
|
+
|
|
30
|
+
2.0 LICENSE RIGHTS
|
|
31
|
+
|
|
32
|
+
2.1 License Grant. Subject to your compliance with section 3.0, including the Commercial Use and Competitor limitations below, and the Third Party Terms, we hereby grant to you a worldwide, royalty-free, sublicensable, non-transferable, and non-exclusive copyright license to use, reproduce, distribute, display, perform, modify, and create derivative works of the Embedl Models in whole or in part.
|
|
33
|
+
|
|
34
|
+
2.2 Contributor License Grant. Subject to your compliance with these Terms, including the Commercial Use and Competitor limitations below, and the Third Party Terms, each Contributor hereby grants to you a worldwide, non-exclusive, royalty-free, sublicensable, and non-transferable (a) copyright license to use, reproduce, distribute, display, perform, modify, and create derivative works of the Embedl Models in whole or in part; and (b) patent license to make, have made, use, offer to sell, sell, import and otherwise transfer the Embedl Models in whole or in part, but only to the extent of patent claims licensable by such Contributor that are necessarily infringed by the exercise of such rights, including for standalone Contributions and Contributions in combination with the Embedl Models.
|
|
35
|
+
|
|
36
|
+
2.3 Reservation of Rights. All rights not expressly granted in this section are reserved to us. You will have no right or license to the Embedl Models other than the rights set forth in this section 2.0, and no other right or license will be implied, by estoppel, conduct, or otherwise. If you are an entity, you may exercise the rights granted in this section solely via your own employees and contractors. We and our licensors retain all right, title and interest in the Embedl Models and associated intellectual property rights, and all copies of the Embedl Models in whole or in part, and we grant no rights to our patents.
|
|
37
|
+
|
|
38
|
+
3.0 LICENSE CONDITIONS
|
|
39
|
+
|
|
40
|
+
3.1 No Use by Competitors. If you are, or your employer or client for whom you work is, a Competitor, you are not licensed to download, license, install or use the Embedl Models. You represent and warrant that you are not a Competitor and are not working for a Competitor. If you are in breach of this warranty, now or in the future, you are not licensed to use the Embedl Models; doing so anyway constitutes breach of these Terms and infringement of our intellectual property rights.
|
|
41
|
+
|
|
42
|
+
3.2 Commercial Use Limitation.
|
|
43
|
+
|
|
44
|
+
a) The licensed rights set forth in section 2.0 exclude Commercial Use if you are an entity, or an affiliate or member of a company group, that has more than 250 employees or annual revenue of €10,000,000 or more ("Large Entity"). If you are or become a Large Entity or part of a group of one, Commercial Use is not or is no longer licensed to you under these Terms; you must contact us for a Commercial Use license. You represent and warrant that you are not a Large Entity and are not working for a Large Entity. If you are in breach of this warranty, now or in the future, you are not licensed to use the Embedl Models for Commercial Use; doing so anyway constitutes breach of these Terms and infringement of our intellectual property rights.
|
|
45
|
+
|
|
46
|
+
b) In addition to any other remedies, if you violate section 3.2(a), you hereby grant to Embedl on behalf of yourself and each of your affiliates, employees and contractors, a non-exclusive, worldwide, perpetual, irrevocable, non-terminable, sublicensable (through multiple tiers), fully paid-up, royalty-free, assignable and transferable intellectual property rights license as to Distributions, to make, use, reproduce, modify, adapt, create derivative works of, improve, extend, enhance, translate, distribute (directly and indirectly, in any medium, under terms of choice), combine, compile, transmit, display and perform publicly, license, rent, lease, and manufacture, sell, offer to sell, and import, your Distributions, for any commercial or non-commercial purpose.
|
|
47
|
+
|
|
48
|
+
3.3 No Use as a Hosted Service. You may not, without our explicit permission, use the Embedl Models, any derivative works thereof, or any models created or trained via the use or incorporation of the Embedl Models, to provide a hosted service, software-as-a-service offering, API, or any other mechanism that allows third parties to access or use the functionality of such materials, or to submit inputs to, receive outputs from, the Embedl Models, whether or not for a fee. Notwithstanding the foregoing, you may deploy the Embedl Models on an internal network accessible solely to your employees and contractors performing work exclusively for you, solely for your internal business purposes.
|
|
49
|
+
|
|
50
|
+
In addition, we are open to hosted use in support of a specific function or feature in a separate application, depending on the context, on a case-by-case basis. Please contact us, and we would be happy to explore that.
|
|
51
|
+
|
|
52
|
+
3.4 Required Notices and Pass-Throughs.
|
|
53
|
+
|
|
54
|
+
a) You will preserve and not suppress proprietary notices, markings, and branding visible in or associated with or displayed via the use of the Embedl Models, and in any Distributions.
|
|
55
|
+
|
|
56
|
+
b) You will give each recipient of your Distribution a copy of these Terms and the following notice: "Optimized Embedl models are licensed under the Embedl Models Community Source License v.1 available at [___], and the terms of Embedl's licensors. Copyright © Embedl AB."
|
|
57
|
+
|
|
58
|
+
c) You must cause any modified files in your Distributions to carry prominent notices explaining the changes that were made.
|
|
59
|
+
|
|
60
|
+
d) You must cause each Distribution to prominently display the following notices, as applicable (depending on which models are selected and included in any Distribution):
|
|
61
|
+
|
|
62
|
+
Llama 3.2 is licensed under the Llama 3.2 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved. Use is subject to applicable terms and policies, including terms currently available at https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/LICENSE
|
|
63
|
+
|
|
64
|
+
Gemma is provided under and subject to the Gemma Terms of Use found at https://ai.google.dev/gemma/terms.
|
|
65
|
+
|
|
66
|
+
Qwen is licensed under the Apache License 2.0, Copyright © Alibaba Cloud. All Rights Reserved.
|
|
67
|
+
|
|
68
|
+
e) For Llama-based Distributions: (a) you must prominently display "Built with Llama" on a related website, user interface, blogpost, about page, or product documentation; and (b) if you use the Embedl Models or any outputs or results of the Embedl Models to create, train, fine tune, or otherwise improve any AI model that is distributed or made available, you shall also include "Llama" at the beginning of any such AI model name.
|
|
69
|
+
|
|
70
|
+
f) You must require each recipient of a Distribution to agree in writing (or through another mechanism establishing enforceable assent) to be bound by the terms and restrictions of this section 3.0 and section 5.2, and to impose the same requirement on any subsequent recipients, so that each recipient in the chain of distribution and/or use is bound by such terms and restrictions.
|
|
71
|
+
|
|
72
|
+
3.5 Contributions. Each Contributor hereby grants to us a worldwide, non-exclusive, royalty-free, sublicensable, and non-transferable (a) copyright license to use, reproduce, distribute, display, perform, modify, and create derivative works of each Contribution; and (b) patent license to make, have made, use, offer to sell, sell, import and otherwise transfer the Embedl Models, but only to the extent of patent claims licensable by such Contributor that are necessarily infringed by the exercise of such rights, for standalone Contributions or in Contributions in combination with the Embedl Models. By submitting a Contribution, you represent and warrant that you have sufficient rights to grant the licenses under these Terms, and that doing so does not violate any obligation to a third party.
|
|
73
|
+
|
|
74
|
+
3.6 Our Marks. You agree that any use of our marks, branding and logos, including Embedl Models ("Marks"), whether permitted or otherwise, will inure to our sole benefit. You will not directly or indirectly: (a) file or prepare any application for registration of any Marks; (b) assert any right, title, license to, or interest in the Marks; or (c) adopt, use, file for registration, or register, in whole or in part, any trademark, service mark, trade name, logo, or domain name which may be confusingly similar to or an infringement of the Marks or any of our domain names.
|
|
75
|
+
|
|
76
|
+
3.7 Feedback. You agree that we may freely use and exploit in perpetuity any feedback, requirements, recommendations, ideas, bug fixes, reviews, ratings, comments, suggestions, or improvements, that you, or any employee or agent thereof, may at any time disclose or submit to us relating to the Embedl Models for our business purposes, including for product, services or solution sales, licensing, support and development, without any obligation or payment to you.
|
|
77
|
+
|
|
78
|
+
3.8 Unauthorized Use. You will not use the Embedl Models or participate in any activities via the Embedl Models in a manner that is likely to be prohibited by law or these Terms or violative of third party rights in any applicable jurisdiction, including intellectual property and data privacy rights. Your use of the Embedl Models must be in full compliance with applicable law and the Third Party Terms. You acknowledge that use of the Embedl Models may be subject to the EU AI Act and other AI-specific laws and regulations. You are solely responsible for determining whether your use or Distribution constitutes a high-risk AI system or is otherwise subject to regulatory obligations, and for compliance with all applicable requirements.
|
|
79
|
+
|
|
80
|
+
3.9 Acceptable Use. You hereby accept and shall fully comply with the following Acceptable Use Policies:
|
|
81
|
+
|
|
82
|
+
a) for the Llama-derived materials: https://www.llama.com/llama3_2/use-policy; and
|
|
83
|
+
|
|
84
|
+
b) for the Gemma-derived materials: https://ai.google.dev/gemma/prohibited_use_policy.
|
|
85
|
+
|
|
86
|
+
3.10 Third Party Restrictions. You acknowledge that certain components derived from Gemma may be subject to remote restriction by Google if Google believes usage violates the Gemma Terms of Use. To the maximum extent permitted by law, we are not liable for any such restriction.
|
|
87
|
+
|
|
88
|
+
3.11 Notifications to Us. If you believe that you are entitled or obligated to act contrary to these Terms under any mandatory or applicable law, you agree to provide us with detailed and substantiated explanation of your reasons in writing at least thirty days before you act, to allow us to assess whether we may, at our sole discretion, provide an alternative remedy for the situation, though we are under no obligation to do so. To be effective, notices to us must be sent to legal@embedl.com.
|
|
89
|
+
|
|
90
|
+
4.0 DISCLAIMER OF WARRANTIES; LIMITATION OF LIABILITY
|
|
91
|
+
|
|
92
|
+
4.1 Disclaimer. TO THE MAXIMUM EXTENT POSSIBLE UNDER APPLICABLE LAW, we provide the Embedl Models on an as-is, as-available basis with all faults, and WE DISCLAIM ALL WARRANTIES OF ANY KIND WITH RESPECT TO THE EMBEDL MODELS, WHETHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. Specifically, we make no warranty that (a) the Embedl Models will meet your requirements, goals or needs, or (b) any errors or deficiencies in the Embedl Models will be corrected.
|
|
93
|
+
|
|
94
|
+
4.2 Exclusion. Notwithstanding any other provision of these Terms, we will not be liable for any losses or damages, whether direct, indirect, punitive, special, incidental or consequential damages, or liable for interruption of business, diminution of value, cost of replacement, downtime, or loss of profits, revenue, use, data, or other economic advantage, in connection with, related to or arising out of the Embedl Models or these Terms, regardless of the theory of liability, whether for breach of these Terms, including breach of warranty, or in tort or otherwise, even if we have been previously advised of the possibility of such damages and even if such damages are reasonably foreseeable. Liability for damages will be so limited and excluded, regardless of the validity or efficacy of any remedy provided herein and even if any remedy fails of its essential purpose. Nothing in these Terms excludes or limits liability for death or personal injury caused by negligence, fraud, intentional misconduct, or gross negligence, or any other liability that cannot be excluded under applicable law.
|
|
95
|
+
|
|
96
|
+
4.3 Indemnification. You hereby agree to defend, indemnify, release and hold harmless Embedl, and its officers, directors, employees, consultants, agents, and representatives, in full from and against any and all claims (including third party claims), losses, liability, damages, and/or costs of every kind and nature, arising out of or in any way connected with: (a) your access to or use of, the Embedl Models or conduct engendered thereby, (b) your breach of these Terms or the Third Party Terms, (c) your infringement or misappropriation of any intellectual property right or other right of any person or entity, or (d) your violations of applicable law.
|
|
97
|
+
|
|
98
|
+
5.0 TERMINATION
|
|
99
|
+
|
|
100
|
+
5.1 Upon Breach. The licenses granted by these Terms will terminate automatically if you are in breach of these Terms, including if you exceed the scope of the licenses granted in section 2.0. Breach of these Terms may result in pursuit of all available remedies for intellectual property rights (including intellectual property rights infringement), the availability of which you hereby acknowledge.
|
|
101
|
+
|
|
102
|
+
5.2 Upon Litigation. The license rights set forth in section 2.0 will terminate automatically if you institute litigation (including a cross-claim or counterclaim in a lawsuit) against any entity alleging that the use, distribution, modification, sale, offer to sell, transfer or import of the Embedl Models, any Contribution (alone or in combination), or the outputs or results of the Embedl Models, constitutes direct, indirect, vicarious, or contributory infringement of intellectual property rights.
|
|
103
|
+
|
|
104
|
+
5.3 By Us. We may on written notice terminate rights granted under these Terms in the event the Embedl Models become subject to third party claims of infringement or violation of intellectual property or other rights or applicable law.
|
|
105
|
+
|
|
106
|
+
5.4 Effect. Upon termination of rights for any reason, all licensed rights granted under these Terms will terminate, and you will immediately cease all use, and destroy all and not retain any copies, of the Embedl Models. Our rights under this section 5.0 are in addition to any other rights and remedies permitted by law or under these Terms.
|
|
107
|
+
|
|
108
|
+
6.0 GENERAL PROVISIONS
|
|
109
|
+
|
|
110
|
+
6.1 Governing Law. These Terms and any disputes between us and related to or concerning these Terms or the Embedl Models (including tort as well as contract claims, and whether pre-contractual or extra-contractual), notwithstanding the choice of laws rules of any jurisdiction to the contrary, will be governed by the procedural and substantive laws of Sweden. The United Nations Convention on Contracts for the International Sale of Goods will not apply and is hereby excluded.
|
|
111
|
+
|
|
112
|
+
6.2 Arbitration. Subject to section 6.3, any disputes between or claims brought by either party arising out of or related to the Embedl Models or these Terms shall be referred to and finally settled exclusively by binding arbitration before the Arbitration Institute of the Stockholm Chamber of Commerce (the "SCC"), and the venue for arbitration will be Stockholm, Sweden. The Rules for Expedited Arbitrations will apply, unless the SCC in its discretion determines, taking into account the complexity of the case, the amount in dispute and other circumstances, that the Arbitration Rules will apply. In the latter case, the SCC will also decide whether the Arbitral Tribunal will be composed of one or three arbitrators. The arbitration will be conducted by telephone, on-line and/or based solely upon written submissions where no in-person appearance is required. If in-person appearance is required, such hearings will be held in Stockholm, Sweden. The arbitrator will apply the law specified in section 6.1. All awards may if necessary be enforced by any court having jurisdiction. The existence of any dispute, the existence or details of the arbitration proceeding, and all related documents, materials, evidence, judgments and awards therein, shall be kept confidential. Except as required by law, no party will make any public announcements with respect to the other party or the proceeding or the award, except as required to enforce same. All disputes will be arbitrated only on an individual basis and not in a class, consolidated or representative action. The arbitrator does not have the power to vary these provisions. Subject to section 6.3, all claims between the parties must be resolved via arbitration in accordance with this section. 
Should either party file an action contrary to this section, the other party may recover lawyers' fees and costs associated with enforcing this section, provided that the party seeking the award has notified the other party in writing of the improperly filed claim, and the other party has failed to withdraw the claim in a timely fashion.
|
|
113
|
+
|
|
114
|
+
6.3 Injunctive Relief; Prevailing Party. You acknowledge and agree that breach of these Terms, or any unauthorized use or distribution of the Embedl Models, may cause irreparable harm to us, the extent of which would be difficult to ascertain, and that we will be entitled to seek immediate injunctive relief and specific performance (in addition to any other available remedies), and to enforce and/or seek redress for infringement of intellectual property rights, in any court of competent jurisdiction under the applicable laws thereto (and such proceeding, and our right to prosecute such a claim, is not subject to arbitration). You hereby consent to the jurisdiction of such courts for any such actions. A party prevailing in any litigation or arbitration related to these Terms or the Embedl Models will be entitled, in addition to such other relief as may be granted, to an award of reasonable attorneys' fees. This section is in addition and without prejudice to the availability of attorneys' fees as a remedy under applicable law, including laws pertaining to intellectual property rights.
|
|
115
|
+
|
|
116
|
+
6.4 Important Final Terms. The Terms constitute the entire agreement between you and us and govern your use of the Embedl Models, superseding any prior agreements, understandings, communications or proposals; except that these Terms do not override, supersede, or modify the terms of any written license agreement signed by you and us. If any provision of the Terms is found by a court of competent jurisdiction to be invalid, the parties nevertheless agree that the court should endeavor to give effect to the parties' intentions as reflected in the provision, and the other provisions of the Terms will remain in full force and effect. No waiver of any provision of these Terms will be deemed a further waiver or continuing waiver of such provision or any other provision, and our failure to assert any right or provision under these Terms will not constitute a waiver of such right or provision. Nothing herein will be deemed to create an agency, partnership, joint venture, or franchisor-franchisee relationship of any kind between us and any user or other person or entity, nor do these terms extend rights to any third party. These Terms are in the English language only, which language will be controlling in all respects, and all versions of these Terms in any other language will be for accommodation only and will not be binding on you or us.
|
|
117
|
+
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# FlashHead
|
|
4
|
+
|
|
5
|
+
### vLLM Plugin for Fast Language Model Head Inference
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
The dense classification head accounts for up to 60% of parameters in small LLMs and roughly half of decode-step compute. FlashHead replaces it with a two-stage retrieval pipeline — **up to 2.0x model-level inference speedup** while maintaining accuracy — training-free and hardware-friendly. FlashHead integrates via vLLM's official `vllm.general_plugins` entry point: no source patches, no custom Docker image.
|
|
9
|
+
|
|
10
|
+
<a href="https://python.org/">
|
|
11
|
+
<img alt="Python" src="https://img.shields.io/badge/Python-3.10+-blue.svg?logo=python" />
|
|
12
|
+
</a>
|
|
13
|
+
<a href="https://github.com/vllm-project/vllm">
|
|
14
|
+
<img alt="vLLM" src="https://img.shields.io/badge/vLLM-0.14.0+-green.svg" />
|
|
15
|
+
</a>
|
|
16
|
+
<a href="https://github.com/embedl/flash-head/blob/main/LICENSE">
|
|
17
|
+
<img alt="License" src="https://img.shields.io/badge/License-Embedl Models Community-red.svg" />
|
|
18
|
+
</a>
|
|
19
|
+
<br>
|
|
20
|
+
<a href="https://arxiv.org/abs/2603.14591">
|
|
21
|
+
<img alt="Paper" src="https://img.shields.io/badge/arXiv-2603.14591-b31b1b.svg?logo=arxiv" />
|
|
22
|
+
</a>
|
|
23
|
+
<a href="https://huggingface.co/collections/embedl/flashhead">
|
|
24
|
+
<img alt="Collection" src="https://img.shields.io/badge/Collection-FlashHead Models-yellow.svg?logo=huggingface" />
|
|
25
|
+
</a>
|
|
26
|
+
<a href="https://huggingface.co/spaces/embedl/Edge-Inference-Benchmarks">
|
|
27
|
+
<img alt="Benchmarks" src="https://img.shields.io/badge/Spaces-Edge Inference Benchmarks-yellow.svg?logo=huggingface" />
|
|
28
|
+
</a>
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
</div>
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
## FlashHead: Efficient Drop-In Replacement for the Classification Head in Language Model Inference
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
The standard LM head computes a dense matrix multiplication $h_t \times W_{vocab}$ at every decode step, scoring all vocabulary tokens regardless of relevance. FlashHead reframes this as a two-stage retrieval problem over clustered token embeddings: first identify which regions of vocabulary space are relevant, then score only those candidates.
|
|
39
|
+
|
|
40
|
+
<p align="center">
|
|
41
|
+
<img src="docs/flash_head_flow_diagram.svg" width="75%" />
|
|
42
|
+
</p>
|
|
43
|
+
|
|
44
|
+
> **⚡ Key Tradeoff** A dense head scores **128,256 tokens per step** (for a 128K vocabulary). With *c = 8,016* clusters and *p = 256* probes, FlashHead scores only **8,016 + 256 × 16 = 12,112 tokens**, a <span style="color:#22c55e; font-weight:600;">10× reduction</span> in scored tokens, while multi-probe retrieval maintains near-perfect recall of the correct next token.
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
<p align="center" width="100%">
|
|
48
|
+
<img src="docs/dense_head_scoring.svg" width="30%"/>
|
|
49
|
+
<img src="docs/arrow.svg" width="4%"/>
|
|
50
|
+
<img src="docs/flash_head_scoring.svg" width="30%"/>
|
|
51
|
+
</p>
|
|
52
|
+
|
|
53
|
+
<strong>Note.</strong> The offline clustering step runs once per model and adds zero overhead at inference time.
|
|
54
|
+
Both stages use contiguous memory access patterns for GPU and edge accelerator efficiency.
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
### Four key ideas (see [paper](https://arxiv.org/abs/2603.14591))
|
|
58
|
+
|
|
59
|
+
- **Equal-sized Clustering** Token embeddings grouped into **balanced clusters** for predictable memory access and stable latency. Unlike hierarchical softmax, cluster sizes stay uniform; critical for GPU and edge accelerators.
|
|
60
|
+
|
|
61
|
+
- **Multi-Probe Retrieval** Instead of committing to a single cluster, FlashHead probes multiple centroids - beam search over vocabulary space. Near-perfect recall with far fewer evaluations.
|
|
62
|
+
|
|
63
|
+
- **Full Decoding Support** Supports both greedy and sampling decoding. For sampling, clusters are selected proportionally to centroid probabilities, **preserving the output distribution**.
|
|
64
|
+
|
|
65
|
+
- **Selective Quantization** Stage 1 (coarse centroid scoring) runs in low precision; Stage 2 preserves accuracy. The head's quantization weakness becomes a **structural advantage**.
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
## 📦 Installation
|
|
69
|
+
|
|
70
|
+
**Prerequisites:** Python 3.10+ and [vLLM](https://github.com/vllm-project/vllm) >= 0.14.0
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install flash-head
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
That's it. The plugin is discovered automatically by vLLM at startup. ✨
|
|
77
|
+
|
|
78
|
+
<details>
|
|
79
|
+
<summary>Install from source</summary>
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
git clone https://github.com/embedl/flash-head.git
|
|
83
|
+
cd flash-head
|
|
84
|
+
pip install .
|
|
85
|
+
```
|
|
86
|
+
</details>
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
## 🚀 Usage
|
|
90
|
+
|
|
91
|
+
### ⌨️ CLI
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# FlashHead activates automatically for compatible models
|
|
95
|
+
vllm serve embedl/Cosmos-Reason2-2B-W4A16-Edge2-FlashHead \
|
|
96
|
+
--host 0.0.0.0 --port 8000 \
|
|
97
|
+
--gpu-memory-utilization 0.75 \
|
|
98
|
+
--max-model-len 8192
|
|
99
|
+
|
|
100
|
+
# Disable without uninstalling
|
|
101
|
+
FLASHHEAD_ENABLED=0 vllm serve ...
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### 🐍 Python
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from vllm import LLM, SamplingParams
|
|
108
|
+
|
|
109
|
+
llm = LLM(
|
|
110
|
+
model="embedl/Cosmos-Reason2-2B-W4A16-Edge2-FlashHead",
|
|
111
|
+
trust_remote_code=True,
|
|
112
|
+
)
|
|
113
|
+
outputs = llm.generate(
|
|
114
|
+
["Explain quantum computing."],
|
|
115
|
+
SamplingParams(max_tokens=50),
|
|
116
|
+
)
|
|
117
|
+
print(outputs[0].outputs[0].text)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
The model's `config.json` contains `"flash_head_cache_dir": "flash_head_assets"` which signals FlashHead to activate. Standard models without this field are completely unaffected.
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
## 🔧 vLLM Plugin Integration
|
|
124
|
+
|
|
125
|
+
1. **Discovery** vLLM discovers the `flash-head` plugin via the `vllm.general_plugins` entry point at startup
|
|
126
|
+
2. **Patching** `register()` is called in every process, intercepting logits computation, sampling, and speculative decoding
|
|
127
|
+
3. **Inference** The worker lazily constructs the FlashHead module on GPU from the model's clustering cache
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
## 🛡️ Safety
|
|
131
|
+
|
|
132
|
+
FlashHead models use a custom architecture name (e.g., `FlashHeadQwen3VLForConditionalGeneration`). Without the plugin installed, vLLM does not recognize the architecture and refuses to load the model. Users cannot accidentally run at reduced speed.
|
|
133
|
+
|
|
134
|
+
| Scenario | Behavior |
|
|
135
|
+
|-----------------------------------------|-------------------------------------------------|
|
|
136
|
+
| Plugin not installed | ❌ vLLM errors: architecture not supported |
|
|
137
|
+
| Plugin installed, `FLASHHEAD_ENABLED=0` | ⏸️ Clean disable, model loads without FlashHead |
|
|
138
|
+
| Plugin installed, enabled | ✅ FlashHead loads on GPU, full speedup |
|
|
139
|
+
|
|
140
|
+
### 🏗️ Supported Architectures
|
|
141
|
+
|
|
142
|
+
See the most recent architectures in [_FLASHHEAD_ARCHITECTURES](https://github.com/embedl/flash-head/blob/main/src/flash_head/__init__.py):
|
|
143
|
+
```python
|
|
144
|
+
_FLASHHEAD_ARCHITECTURES = {
|
|
145
|
+
"FlashHeadLlamaForCausalLM": "vllm.model_executor.models.llama:LlamaForCausalLM",
|
|
146
|
+
"FlashHeadQwen3ForCausalLM": "vllm.model_executor.models.qwen3:Qwen3ForCausalLM",
|
|
147
|
+
"FlashHeadQwen3VLForConditionalGeneration": "vllm.model_executor.models.qwen3_vl:Qwen3VLForConditionalGeneration",
|
|
148
|
+
"FlashHeadGemma3ForCausalLM": "vllm.model_executor.models.gemma2:Gemma2ForCausalLM",
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
## 📤 Publishing FlashHead Models
|
|
154
|
+
|
|
155
|
+
For the safety check to work, FlashHead models should use this `config.json` structure:
|
|
156
|
+
|
|
157
|
+
```json
|
|
158
|
+
{
|
|
159
|
+
"architectures": ["FlashHeadQwen3VLForConditionalGeneration"],
|
|
160
|
+
"model_type": "qwen3_vl",
|
|
161
|
+
"flash_head_cache_dir": "flash_head_assets"
|
|
162
|
+
}
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
- `architectures` uses the `FlashHead*` prefix so vLLM rejects the model without the plugin
|
|
166
|
+
- `model_type` stays standard so vLLM can resolve the base model class
|
|
167
|
+
- `flash_head_cache_dir` points to the clustering cache directory
|
|
168
|
+
- Do NOT include `auto_map` -- the plugin handles registration
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
## 🗺️ Roadmap
|
|
172
|
+
|
|
173
|
+
- ✅ Core FlashHead plugin for vLLM (greedy decoding)
|
|
174
|
+
- ✅ Balanced clustering with multiprobe retrieval
|
|
175
|
+
- ✅ Inference-time sampling across full vocabulary
|
|
176
|
+
- ✅ Quantized model support
|
|
177
|
+
- 🔄 **Speculative decoding:** Full FlashHead integration with vLLM's speculative decoding pipeline *(in progress)*
|
|
178
|
+
- 🔄 **EAGLE draft proposals:** FlashHead-accelerated draft generation for EAGLE speculative decoding *(in progress)*
|
|
179
|
+
- ⬜ Additional model architectures
|
|
180
|
+
- ⬜ Benchmarks on additional edge platforms (Qualcomm, AMD, Intel, ...)
|
|
181
|
+
|
|
182
|
+
💡 Want a feature? [Open an issue](https://github.com/embedl/flash-head/issues/new)!
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
## 🤝 Contributing
|
|
186
|
+
|
|
187
|
+
We welcome contributions, feedback, and collaboration. Whether you're interested in adding support for new architectures, improving performance, or integrating FlashHead into your own inference stack -- we'd love to hear from you.
|
|
188
|
+
|
|
189
|
+
- **Report issues** Bug reports and feature requests help us improve. [Open an issue](https://github.com/embedl/flash-head/issues/new).
|
|
190
|
+
- **Submit PRs** Code contributions for new architectures, optimizations, or bug fixes.
|
|
191
|
+
- **Research collaboration** Working on efficient inference, vocabulary approximation, or edge deployment? Reach out.
|
|
192
|
+
- **Model contributions** Publish FlashHead-optimized models to the [HuggingFace collection](https://huggingface.co/collections/embedl/flash-head).
|
|
193
|
+
- **Benchmarks** Run FlashHead on your hardware and submit results to the [Edge Inference Benchmarks](https://huggingface.co/spaces/embedl/Edge-Inference-Benchmarks) space.
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
## 📂 Project Structure
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
flash-head/
|
|
200
|
+
├── src/flash_head/
|
|
201
|
+
│ ├── __init__.py # Plugin entry point (register)
|
|
202
|
+
│ ├── flash_head.py # Core clustering-based head
|
|
203
|
+
│ ├── loading.py # Model/asset loading from HF Hub
|
|
204
|
+
│ └── patches/ # vLLM runtime patches
|
|
205
|
+
│
|
|
206
|
+
├── pyproject.toml
|
|
207
|
+
└── LICENSE
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
## 📖 Citation
|
|
212
|
+
|
|
213
|
+
If you use FlashHead in your research, please cite:
|
|
214
|
+
|
|
215
|
+
```bibtex
|
|
216
|
+
@article{tranheden2026flashhead,
|
|
217
|
+
title={FlashHead: Efficient Drop-In Replacement for the Classification Head in Language Model Inference},
|
|
218
|
+
author={Tranheden, Wilhelm and Ahmed, Shahnawaz and Dubhashi, Devdatt and Matthiesen, Jonna and von Essen, Hannes},
|
|
219
|
+
journal={arXiv preprint arXiv:2603.14591},
|
|
220
|
+
year={2026}
|
|
221
|
+
}
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## License
|
|
225
|
+
|
|
226
|
+
Free for non-commercial use under the terms of the Embedl Models Community License (v.1.0).
|
|
227
|
+
|
|
228
|
+
<div align="center">
|
|
229
|
+
|
|
230
|
+
### Interested in FlashHead?
|
|
231
|
+
|
|
232
|
+
Enterprise licensing, custom model optimization, and engineering support available.
|
|
233
|
+
|
|
234
|
+
[models@embedl.com](mailto:models@embedl.com) • [embedl.com](https://embedl.com)
|
|
235
|
+
|
|
236
|
+
<br>
|
|
237
|
+
|
|
238
|
+
<sub>© 2026 Embedl AB. All rights reserved.</sub>
|
|
239
|
+
|
|
240
|
+
</div>
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Copyright (C) 2026 Embedl AB

[build-system]
requires = ["setuptools>=64"]
build-backend = "setuptools.build_meta"

[project]
name = "flash-head"
# Version is resolved at build time from flash_head/_version.py (see
# [tool.setuptools.dynamic] below).
dynamic = ["version"]
description = "FlashHead: vLLM Plugin for Fast Language Model Head Inference"
requires-python = ">=3.10"
dependencies = [] # torch, safetensors, huggingface_hub provided by vLLM

# vLLM discovers general plugins through this entry-point group at startup
# and calls the referenced function (flash_head.register) in every process.
[project.entry-points."vllm.general_plugins"]
flash_head = "flash_head:register"

[tool.setuptools.dynamic]
version = {attr = "flash_head._version.__version__"}

[tool.setuptools.packages.find]
where = ["src"]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Copyright (C) 2026 Embedl AB
|
|
2
|
+
|
|
3
|
+
"""FlashHead: fast approximate language model head as a vLLM plugin.
|
|
4
|
+
|
|
5
|
+
FlashHead replaces the standard lm_head (final vocabulary projection layer)
|
|
6
|
+
with a clustering-based approximation that only evaluates logits for tokens
|
|
7
|
+
in the top-k most similar clusters, delivering significant speedups on
|
|
8
|
+
edge devices.
|
|
9
|
+
|
|
10
|
+
This package integrates FlashHead into vLLM via the official plugin system
|
|
11
|
+
(vllm.general_plugins entry point), eliminating the need for source patches
|
|
12
|
+
or custom Docker images.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
|
|
18
|
+
from flash_head._version import __version__
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
_patches_applied = False
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def register():
    """vLLM plugin entry point. Called in every process before model init."""
    global _patches_applied

    # Idempotent guard: vLLM may invoke the entry point more than once
    # within the same process; only the first call does any work.
    if _patches_applied:
        return
    _patches_applied = True

    # Opt-out escape hatch so the package can stay installed while disabled.
    disabled = os.environ.get("FLASHHEAD_ENABLED", "1") == "0"
    if disabled:
        logger.info("[FlashHead] Disabled via FLASHHEAD_ENABLED=0")
        return

    # Imported lazily so the disabled path never touches vLLM internals.
    from flash_head.patches import apply_all

    apply_all()
    _register_architectures()
    logger.info("[FlashHead] Plugin registered")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _register_architectures():
    """Make vLLM accept the FlashHead architecture names.

    Published FlashHead checkpoints declare architectures such as
    'FlashHeadQwen3VLForConditionalGeneration', which stock vLLM rejects.
    Registering them here turns a missing plugin into a clear
    "architecture not supported" error instead of a silent fallback to
    the slow standard lm_head path.
    """
    # Architecture name -> lazy "module:Class" path of the base vLLM model.
    # String paths defer the import so no CUDA initialization happens here.
    # FlashHead itself is wired in via the LogitsProcessor patch, not a
    # custom model class, so the registry only needs to resolve each name
    # to its stock model class.
    # NOTE(review): Gemma3 maps onto the Gemma2 class — presumably vLLM
    # serves Gemma3 through that module; confirm against the vLLM version.
    arch_map = {
        "FlashHeadLlamaForCausalLM": "vllm.model_executor.models.llama:LlamaForCausalLM",
        "FlashHeadQwen3ForCausalLM": "vllm.model_executor.models.qwen3:Qwen3ForCausalLM",
        "FlashHeadQwen3VLForConditionalGeneration": "vllm.model_executor.models.qwen3_vl:Qwen3VLForConditionalGeneration",
        "FlashHeadGemma3ForCausalLM": "vllm.model_executor.models.gemma2:Gemma2ForCausalLM",
    }

    try:
        from vllm import ModelRegistry

        already_known = ModelRegistry.get_supported_archs()
        for arch_name, class_path in arch_map.items():
            if arch_name in already_known:
                continue
            ModelRegistry.register_model(arch_name, class_path)
            logger.info("[FlashHead] Registered architecture %s", arch_name)
    except Exception as e:
        # Best-effort by design: registration failures (e.g. vLLM absent or
        # API drift) must not break process startup.
        logger.debug("[FlashHead] Architecture registration skipped: %s", e)
|