lemonade-sdk 8.1.0__py3-none-any.whl → 8.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic; see the package's registry page for more details.

@@ -263,8 +263,10 @@ class LemonadeTray(SystemTray):
263
263
  self.server.uvicorn_server.should_exit = True
264
264
  self.server_thread.join(timeout=2)
265
265
 
266
- # Update the port
266
+ # Update the port in both the tray and the server instance
267
267
  self.port = new_port
268
+ if self.server:
269
+ self.server.port = new_port
268
270
 
269
271
  # Restart the server
270
272
  self.server_thread = threading.Thread(target=self.start_server, daemon=True)
@@ -425,7 +427,7 @@ class LemonadeTray(SystemTray):
425
427
  Start the uvicorn server.
426
428
  """
427
429
  self.server = self.server_factory()
428
- self.server.uvicorn_server = self.server.run_in_thread(port=self.port)
430
+ self.server.uvicorn_server = self.server.run_in_thread()
429
431
  self.server.uvicorn_server.run()
430
432
 
431
433
  def run(self):
@@ -26,7 +26,7 @@ class ServerRunner(threading.Thread):
26
26
  def run(self):
27
27
  try:
28
28
  # Create the server instance
29
- self.server = Server()
29
+ self.server = Server(port=self.port, log_level="warning")
30
30
 
31
31
  # Configure the server with model/tokenizer
32
32
  self.server.model = self.model
@@ -44,9 +44,7 @@ class ServerRunner(threading.Thread):
44
44
  )
45
45
 
46
46
  # Set up the server for threaded execution
47
- self.uvicorn_server = self.server.run_in_thread(
48
- port=self.port, host=self.host, log_level="warning"
49
- )
47
+ self.uvicorn_server = self.server.run_in_thread(host=self.host)
50
48
 
51
49
  # Set the ready event
52
50
  self.ready_event.set()
lemonade/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "8.1.0"
1
+ __version__ = "8.1.1"
@@ -451,6 +451,12 @@ class Install:
451
451
  choices=["0.6.0"],
452
452
  )
453
453
 
454
+ parser.add_argument(
455
+ "--llamacpp",
456
+ help="Install llama.cpp binaries with specified backend",
457
+ choices=["rocm", "vulkan"],
458
+ )
459
+
454
460
  return parser
455
461
 
456
462
  @staticmethod
@@ -739,18 +745,32 @@ class Install:
739
745
 
740
746
  print(f"\nQuark installed successfully at: {quark_path}")
741
747
 
748
+ @staticmethod
749
+ def _install_llamacpp(backend):
750
+ """
751
+ Install llama.cpp binaries with the specified backend.
752
+
753
+ Args:
754
+ backend: The backend to use ('rocm' or 'vulkan')
755
+ """
756
+
757
+ from lemonade.tools.llamacpp.utils import install_llamacpp
758
+
759
+ install_llamacpp(backend)
760
+
742
761
  def run(
743
762
  self,
744
763
  ryzenai: Optional[str] = None,
745
764
  build_model: Optional[str] = None,
746
765
  quark: Optional[str] = None,
766
+ llamacpp: Optional[str] = None,
747
767
  yes: bool = False,
748
768
  token: Optional[str] = None,
749
769
  ):
750
- if ryzenai is None and quark is None and models is None:
770
+ if ryzenai is None and quark is None and llamacpp is None:
751
771
  raise ValueError(
752
772
  "You must select something to install, "
753
- "for example `--ryzenai`, `--quark`, or `--models`"
773
+ "for example `--ryzenai`, `--quark`, or `--llamacpp`"
754
774
  )
755
775
 
756
776
  if ryzenai is not None:
@@ -759,6 +779,9 @@ class Install:
759
779
  if quark is not None:
760
780
  self._install_quark(quark)
761
781
 
782
+ if llamacpp is not None:
783
+ self._install_llamacpp(llamacpp)
784
+
762
785
 
763
786
  def main():
764
787
  installer = Install()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lemonade-sdk
3
- Version: 8.1.0
3
+ Version: 8.1.1
4
4
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
5
5
  Author-email: lemonade@amd.com
6
6
  Requires-Python: >=3.10, <3.13
@@ -27,7 +27,8 @@ Requires-Dist: transformers<=4.53.2
27
27
  Requires-Dist: jinja2
28
28
  Requires-Dist: tabulate
29
29
  Requires-Dist: sentencepiece
30
- Requires-Dist: huggingface-hub==0.33.0
30
+ Requires-Dist: huggingface-hub[hf_xet]==0.33.0
31
+ Requires-Dist: python-dotenv
31
32
  Provides-Extra: oga-ryzenai
32
33
  Requires-Dist: onnxruntime-genai-directml-ryzenai==0.7.0.2; extra == "oga-ryzenai"
33
34
  Requires-Dist: protobuf>=6.30.1; extra == "oga-ryzenai"
@@ -40,6 +41,7 @@ Requires-Dist: accelerate; extra == "dev"
40
41
  Requires-Dist: datasets; extra == "dev"
41
42
  Requires-Dist: pandas>=1.5.3; extra == "dev"
42
43
  Requires-Dist: matplotlib; extra == "dev"
44
+ Requires-Dist: model-generate==1.5.0; (platform_system == "Windows" and python_version == "3.10") and extra == "dev"
43
45
  Requires-Dist: human-eval-windows==1.0.4; extra == "dev"
44
46
  Requires-Dist: lm-eval[api]; extra == "dev"
45
47
  Provides-Extra: oga-hybrid
@@ -136,7 +138,9 @@ Dynamic: summary
136
138
  <a href="https://discord.gg/5xXzkMu8Zk">Discord</a>
137
139
  </h3>
138
140
 
139
- Lemonade makes it easy to run Large Language Models (LLMs) on your PC. Our focus is using the best tools, such as neural processing units (NPUs) and Vulkan GPU acceleration, to maximize LLM speed and responsiveness.
141
+ Lemonade helps users run local LLMs with the highest performance by configuring state-of-the-art inference engines for their NPUs and GPUs.
142
+
143
+ Startups such as [Styrk AI](https://styrk.ai/styrk-ai-and-amd-guardrails-for-your-on-device-ai-revolution/), research teams like [Hazy Research at Stanford](https://www.amd.com/en/developer/resources/technical-articles/2025/minions--on-device-and-cloud-language-model-collaboration-on-ryz.html), and large companies like [AMD](https://www.amd.com/en/developer/resources/technical-articles/unlocking-a-wave-of-llm-apps-on-ryzen-ai-through-lemonade-server.html) use Lemonade to run LLMs.
140
144
 
141
145
  ## Getting Started
142
146
 
@@ -155,7 +159,7 @@ Lemonade makes it easy to run Large Language Models (LLMs) on your PC. Our focus
155
159
  </p>
156
160
 
157
161
  > [!TIP]
158
- > Want your app featured here? Let's do it! Shoot us a message on [Discord](https://discord.gg/5xXzkMu8Zk), [create an issue](https://github.com/lemonade-sdk/lemonade/issues), or email lemonade@amd.com.
162
+ > Want your app featured here? Let's do it! Shoot us a message on [Discord](https://discord.gg/5xXzkMu8Zk), [create an issue](https://github.com/lemonade-sdk/lemonade/issues), or [email](lemonade@amd.com).
159
163
 
160
164
  ## Using the CLI
161
165
 
@@ -177,7 +181,10 @@ To check all models available, use the `list` command:
177
181
  lemonade-server list
178
182
  ```
179
183
 
180
- > Note: If you installed from source, use the `lemonade-server-dev` command instead.
184
+ > **Note**: If you installed from source, use the `lemonade-server-dev` command instead.
185
+
186
+ > **Tip**: You can use `--llamacpp vulkan/rocm` to select a backend when running GGUF models.
187
+
181
188
 
182
189
  ## Model Library
183
190
 
@@ -219,7 +226,7 @@ Lemonade supports the following configurations, while also making it easy to swi
219
226
  <tr>
220
227
  <td><strong>🎮 GPU</strong></td>
221
228
  <td align="center">—</td>
222
- <td align="center">Vulkan: All platforms<br><small>Focus:<br/>Ryzen™ AI 7000/8000/300<br/>Radeon™ 7000/9000</small></td>
229
+ <td align="center">Vulkan: All platforms<br>ROCm: Selected AMD platforms*</td>
223
230
  <td align="center">—</td>
224
231
  <td align="center">✅</td>
225
232
  <td align="center">✅</td>
@@ -235,6 +242,38 @@ Lemonade supports the following configurations, while also making it easy to swi
235
242
  </tbody>
236
243
  </table>
237
244
 
245
+ <details>
246
+ <summary><small><i>* See supported AMD ROCm platforms</i></small></summary>
247
+
248
+ <br>
249
+
250
+ <table>
251
+ <thead>
252
+ <tr>
253
+ <th>Architecture</th>
254
+ <th>Platform Support</th>
255
+ <th>GPU Models</th>
256
+ </tr>
257
+ </thead>
258
+ <tbody>
259
+ <tr>
260
+ <td><b>gfx1151</b> (STX Halo)</td>
261
+ <td>Windows, Ubuntu</td>
262
+ <td>Ryzen AI MAX+ Pro 395</td>
263
+ </tr>
264
+ <tr>
265
+ <td><b>gfx120X</b> (RDNA4)</td>
266
+ <td>Windows only</td>
267
+ <td>Radeon AI PRO R9700, RX 9070 XT/GRE/9070, RX 9060 XT</td>
268
+ </tr>
269
+ <tr>
270
+ <td><b>gfx110X</b> (RDNA3)</td>
271
+ <td>Windows, Ubuntu</td>
272
+ <td>Radeon PRO W7900/W7800/W7700/V710, RX 7900 XTX/XT/GRE, RX 7800 XT, RX 7700 XT</td>
273
+ </tr>
274
+ </tbody>
275
+ </table>
276
+ </details>
238
277
 
239
278
  ## Integrate Lemonade Server with Your Application
240
279
 
@@ -4,17 +4,17 @@ lemonade/cache.py,sha256=5iZbk273TiTMqK_vdzPOPYTo6VsWW2gNByOISA9zi1w,3002
4
4
  lemonade/cli.py,sha256=9Pcs3PcrWC2F8_pcBaz09xHUICIJTvpemBdPGyXkjIk,4395
5
5
  lemonade/sequence.py,sha256=KSH7BPsiyDKsOsg_ziQKEGsDwMmuO_YbgPRBxkZd0pw,13267
6
6
  lemonade/state.py,sha256=sdSezla7Cd7KYL90xY3p9kcNV4ndSyN6UvNLOr3vBMA,5261
7
- lemonade/version.py,sha256=c04nFsyfS0zYoDvZjLO-uEi12TFB5EWSD6fiWiI7OLQ,22
7
+ lemonade/version.py,sha256=8YlEPKK1Cm5T4dPa2BQPpPwVVTzjPLnmqAeNcTb5nOw,22
8
8
  lemonade/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  lemonade/common/build.py,sha256=zTb0m1-kuUx6zw5QHp2SNnVuN6jOTMQ2FCdj9iH374U,6140
10
10
  lemonade/common/cli_helpers.py,sha256=hjBfXrTtFl8gmCFlL-ksviXR0mOcdPtTWVNKoEp3PG4,4993
11
11
  lemonade/common/exceptions.py,sha256=w83sVKmL1QXoJlGjj_bRyjIBMhlMqdVQy_FEOTu2YQI,2050
12
12
  lemonade/common/filesystem.py,sha256=QV3cHhKNu-7W2rr8wZ4JQfD2rP_5T2Js7jiDQBYWHVQ,12142
13
- lemonade/common/inference_engines.py,sha256=lcmir_pATr71TfSBJoIZEi3G9xyxNwi2_xpPvPD8_xI,12932
13
+ lemonade/common/inference_engines.py,sha256=OJQcED9P1ZeQ8d11lDMNeAoaFoUuZlsDcwEZXLbqWRg,12579
14
14
  lemonade/common/network.py,sha256=p1lWJkN0H5hCpb4rKi3Zc47W_BRrrm-7ghdTALJLGqU,1944
15
15
  lemonade/common/printing.py,sha256=GFFzrXIineIOMa9yu0lo5sL4j6A5BBg_T9aUCdP-juw,3229
16
16
  lemonade/common/status.py,sha256=xSOZN508cdRtrs1HVyr9zmASYg69EsZBLSs0lroLoCM,16519
17
- lemonade/common/system_info.py,sha256=dOtX8WLHCz1xmURZWnqhDbyNZv_AulrpX_bbI58eHFQ,27084
17
+ lemonade/common/system_info.py,sha256=pn-k3zMQCbt5cu3aHXa4cENgrubOK97gs9PYdGPsFXA,28405
18
18
  lemonade/common/test_helpers.py,sha256=Gwk-pa_6xYAo2oro-2EJNfuouAfw8k_brCbcMC-E-r0,758
19
19
  lemonade/profilers/__init__.py,sha256=JKVonvJ4XZ9_6sKXPWsiMLQCNyzQOxhQw5BEHR1qOfU,31
20
20
  lemonade/profilers/memory_tracker.py,sha256=1iuKt0FmNVYLDnOc-oZM8dX9TUksvoxO0m2EoYWjhYQ,9367
@@ -33,11 +33,11 @@ lemonade/tools/huggingface/bench.py,sha256=-mTfldCtquL4mspq8ykVwDc9Mut5Ecv_jHJnS
33
33
  lemonade/tools/huggingface/load.py,sha256=KsSGOBBD-tNEIfYC8mCWV_jpnkjHMhN3juVmC1Ln4uQ,7745
34
34
  lemonade/tools/huggingface/utils.py,sha256=j1S-IgjDsznUIVwkHSqqChmFyqIx9f3WcEelzohWwvU,13955
35
35
  lemonade/tools/llamacpp/bench.py,sha256=1fkE02ecg-jRk92i5dTAXz6re14WH8bd-Z9l-m3lbDA,4844
36
- lemonade/tools/llamacpp/load.py,sha256=SKacK2n8LpC4DN4yALyEpV2c8_sgOv2G7t6Nlyu7XXg,6273
37
- lemonade/tools/llamacpp/utils.py,sha256=vHA5kykkdHSsMGmbEA4RyOHr8wFIh1WenfhCvY8WxZs,22445
36
+ lemonade/tools/llamacpp/load.py,sha256=DFCvQN548Ch9H8U_rHOiYviinzw6vixb5-V7xLj7XE4,6499
37
+ lemonade/tools/llamacpp/utils.py,sha256=CTWnzbEYGPSbOizF26yCnyNrHDY19pLusU-YyND992s,29070
38
38
  lemonade/tools/oga/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
39
  lemonade/tools/oga/bench.py,sha256=PJXv4UchcS2YPwijNzef8DY4DSAKYxIYY1ycHuH3T34,5005
40
- lemonade/tools/oga/load.py,sha256=O82ezF7Jhgz3CJrxDWZYqLHyD_0NS1nsvfMWDaaUI4I,33728
40
+ lemonade/tools/oga/load.py,sha256=6Pf_QrHpIXDbfpTwFNRj4RmWTxI-RImhYuqRvmTVgmY,33722
41
41
  lemonade/tools/oga/utils.py,sha256=Xd7tmNr69u_bCut0hZqA7saUR3NFZlp4bvWo54mOZb0,16918
42
42
  lemonade/tools/quark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
43
  lemonade/tools/quark/quark_load.py,sha256=FJ4LJKTToZbHHWVEOBLadae1a3jCnnY4KvXySHbkJMA,5589
@@ -46,27 +46,27 @@ lemonade/tools/report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
46
46
  lemonade/tools/report/llm_report.py,sha256=bVHhwCINA-Ok2EdSwAsLubsc83N3KWOVuwTguw7jDcE,6676
47
47
  lemonade/tools/report/table.py,sha256=ssqy1bZqF-wptNzKEOj6_9REtCNZyXO8R5vakAtg3R4,27973
48
48
  lemonade/tools/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
- lemonade/tools/server/llamacpp.py,sha256=OP0j74QcowEu3zFEcrKIsBbGDOFemBXS5F5DC6oQHaI,18853
50
- lemonade/tools/server/serve.py,sha256=0-NprfsU-YrX8Qsf1atEi6wPJWemrPjHKEBHV69SwCQ,57046
49
+ lemonade/tools/server/llamacpp.py,sha256=KZO4npzefvbaPvlZbpCYsdW0tMSfmmupT8gaK9y65I8,17962
50
+ lemonade/tools/server/serve.py,sha256=PAAGowj2Z5AQIW3G1l52taNyf_0U4kRFR3G735M4DsU,55513
51
51
  lemonade/tools/server/tool_calls.py,sha256=xrAlQwKG-nv2xLlf8f9CDSaUbyMn8ZtHkds9iZLG9K8,5230
52
- lemonade/tools/server/tray.py,sha256=yoGCM8j_2KzPqo-AlYiauWd8QR56yp6jW6HZ9921Ydg,17525
52
+ lemonade/tools/server/tray.py,sha256=qlQKBkQwG9W2v9GTyycvFc12_jly6vPU1uEkrIFBGTs,17624
53
53
  lemonade/tools/server/webapp.py,sha256=8Das5yXOaSBLZmSZ_eddJajQFxBhvl5D6GI_hHlGbE0,1040
54
54
  lemonade/tools/server/static/favicon.ico,sha256=hMmP9qGJNeZ0mFS86JIqPbZstXMZn0Z76_HfHQpREAU,126745
55
- lemonade/tools/server/static/styles.css,sha256=8wQ5Cg4rbEh03kC8t7ALE7dB20GiD0Pfu5BAxh9hECU,26429
56
- lemonade/tools/server/static/webapp.html,sha256=KZm1ZFIhQzLT2Y2wy3hFsQxcOxFzv-blaeLzc1ODhb8,36396
55
+ lemonade/tools/server/static/styles.css,sha256=M_JrH_vML65MWun-C8XCvLOFw35qZURSa77Fk4fVngQ,30029
56
+ lemonade/tools/server/static/webapp.html,sha256=oU6FZHGQCq-SoT6VkWObQvYzzNS0ser5Fmqx2j_5jCI,54380
57
57
  lemonade/tools/server/utils/port.py,sha256=XnIg2qS73QRrsJn6LgHcrJPmku30Tv6vsYcBVMj82K4,2186
58
58
  lemonade/tools/server/utils/system_tray.py,sha256=b9lvNv9chJKQxvmH7qzAuUe6H9HsLu7pdHFqGlAJaL0,12654
59
- lemonade/tools/server/utils/thread.py,sha256=pK9K_6DNWoQ78NArkAX3Ym2WsxLnCs9sKTk6TitlYnI,2804
59
+ lemonade/tools/server/utils/thread.py,sha256=Z-PDzGcpgfN2qxTmtlROWqrUN0B2fXdPrqo_J10fR_w,2772
60
60
  lemonade_install/__init__.py,sha256=26zohKg2jgr_5y7tObduWMYQg8zCTWMZHL8lfi2zZVQ,40
61
- lemonade_install/install.py,sha256=TBX-VwEHcPo4WX0K_12pKKINnIK3o4SUo3L5XjkqEtw,27669
62
- lemonade_sdk-8.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
63
- lemonade_sdk-8.1.0.dist-info/licenses/NOTICE.md,sha256=B8lEqi4QE41J9ljz4Riv2JgHD1v8GCZE6nNBHO3KIA0,2135
64
- lemonade_server/cli.py,sha256=6QJ5fxNLuVUbuHauA5JHXf0H5dqJ5E4GNTo4YoMOJtg,16049
61
+ lemonade_install/install.py,sha256=Zl_JtEIhbqZZTvxcqtq895IomEN-JNxp9xOZEtahMHQ,28289
62
+ lemonade_sdk-8.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
63
+ lemonade_sdk-8.1.1.dist-info/licenses/NOTICE.md,sha256=B8lEqi4QE41J9ljz4Riv2JgHD1v8GCZE6nNBHO3KIA0,2135
64
+ lemonade_server/cli.py,sha256=CFfhrRgZNJCd0rDRBF3TeS3dMJgwlKGtvT5_kbsWaXk,17316
65
65
  lemonade_server/model_manager.py,sha256=O3fIX52AqU0z10WzPmNEA3lQ_KjOqNq_G-SxjwIgEio,10781
66
66
  lemonade_server/pydantic_models.py,sha256=qEvF7x7AuHCHMiByVzGGuLdQTNs233Sw9uQq5cpI6is,2721
67
- lemonade_server/server_models.json,sha256=gitKHj_VHANxjtcXeE5zFpukVO0HyEfKhu3ZaZsj2xo,8867
68
- lemonade_sdk-8.1.0.dist-info/METADATA,sha256=c3JxCUYw5ujhGSb3FX3mG6UmgG5BLqik8a5j4oe8n7o,15712
69
- lemonade_sdk-8.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
70
- lemonade_sdk-8.1.0.dist-info/entry_points.txt,sha256=gJppn0ETtXXR6ceKWEIRdk42kMC7ps59EmU3NCPyPUk,144
71
- lemonade_sdk-8.1.0.dist-info/top_level.txt,sha256=10ap5GNiPhalO4V50LRoxA1FqRT9g3Xkia6BITu880k,42
72
- lemonade_sdk-8.1.0.dist-info/RECORD,,
67
+ lemonade_server/server_models.json,sha256=iag_dG9S1tkHZUhkJmGAfiUJkgEazdQSv7stC1fVAsQ,9741
68
+ lemonade_sdk-8.1.1.dist-info/METADATA,sha256=XT9cwNUAkhwQ6kad6l7t2nj7m8S0t-9GvaFLOMxLCyE,17065
69
+ lemonade_sdk-8.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
70
+ lemonade_sdk-8.1.1.dist-info/entry_points.txt,sha256=gJppn0ETtXXR6ceKWEIRdk42kMC7ps59EmU3NCPyPUk,144
71
+ lemonade_sdk-8.1.1.dist-info/top_level.txt,sha256=10ap5GNiPhalO4V50LRoxA1FqRT9g3Xkia6BITu880k,42
72
+ lemonade_sdk-8.1.1.dist-info/RECORD,,
lemonade_server/cli.py CHANGED
@@ -39,11 +39,19 @@ class ModelNotAvailableError(Exception):
39
39
  """
40
40
 
41
41
 
42
+ class ModelLoadError(Exception):
43
+ """
44
+ The model failed to load on the server
45
+ """
46
+
47
+
42
48
  def serve(
43
49
  port: int = None,
44
50
  log_level: str = None,
45
51
  tray: bool = False,
46
52
  use_thread: bool = False,
53
+ llamacpp_backend: str = None,
54
+ ctx_size: int = None,
47
55
  ):
48
56
  """
49
57
  Execute the serve command
@@ -51,26 +59,33 @@ def serve(
51
59
 
52
60
  # Otherwise, start the server
53
61
  print("Starting Lemonade Server...")
54
- from lemonade.tools.server.serve import Server, DEFAULT_PORT, DEFAULT_LOG_LEVEL
62
+ from lemonade.tools.server.serve import (
63
+ Server,
64
+ DEFAULT_PORT,
65
+ DEFAULT_LOG_LEVEL,
66
+ DEFAULT_LLAMACPP_BACKEND,
67
+ DEFAULT_CTX_SIZE,
68
+ )
55
69
 
56
70
  port = port if port is not None else DEFAULT_PORT
57
71
  log_level = log_level if log_level is not None else DEFAULT_LOG_LEVEL
72
+ llamacpp_backend = (
73
+ llamacpp_backend if llamacpp_backend is not None else DEFAULT_LLAMACPP_BACKEND
74
+ )
58
75
 
59
- # Hidden environment variable to enable input truncation (experimental feature)
60
- truncate_inputs = os.environ.get("LEMONADE_TRUNCATE_INPUTS", None)
76
+ # Use ctx_size if provided, otherwise use default
77
+ ctx_size = ctx_size if ctx_size is not None else DEFAULT_CTX_SIZE
61
78
 
62
79
  # Start the server
63
- serve_kwargs = {
64
- "log_level": log_level,
65
- "truncate_inputs": truncate_inputs,
66
- "tray": tray,
67
- }
68
- server = Server()
80
+ server = Server(
81
+ port=port,
82
+ log_level=log_level,
83
+ ctx_size=ctx_size,
84
+ tray=tray,
85
+ llamacpp_backend=llamacpp_backend,
86
+ )
69
87
  if not use_thread:
70
- server.run(
71
- port=port,
72
- **serve_kwargs,
73
- )
88
+ server.run()
74
89
  else:
75
90
  from threading import Thread
76
91
  import time
@@ -78,8 +93,6 @@ def serve(
78
93
  # Start a background thread to run the server
79
94
  server_thread = Thread(
80
95
  target=server.run,
81
- args=(port,),
82
- kwargs=serve_kwargs,
83
96
  daemon=True,
84
97
  )
85
98
  server_thread.start()
@@ -243,7 +256,13 @@ def delete(model_names: List[str]):
243
256
  ModelManager().delete_model(model_name)
244
257
 
245
258
 
246
- def run(model_name: str):
259
+ def run(
260
+ model_name: str,
261
+ port: int = None,
262
+ log_level: str = None,
263
+ llamacpp_backend: str = None,
264
+ ctx_size: int = None,
265
+ ):
247
266
  """
248
267
  Start the server if not running and open the webapp with the specified model
249
268
  """
@@ -254,7 +273,16 @@ def run(model_name: str):
254
273
  _, port = get_server_info()
255
274
  server_previously_running = port is not None
256
275
  if not server_previously_running:
257
- port, server_thread = serve(use_thread=True, tray=True, log_level="info")
276
+ port, server_thread = serve(
277
+ port=port,
278
+ log_level=log_level,
279
+ tray=True,
280
+ use_thread=True,
281
+ llamacpp_backend=llamacpp_backend,
282
+ ctx_size=ctx_size,
283
+ )
284
+ else:
285
+ port = running_port
258
286
 
259
287
  # Pull model
260
288
  pull([model_name])
@@ -412,6 +440,29 @@ def list_models():
412
440
  print(tabulate(table_data, headers=headers, tablefmt="simple"))
413
441
 
414
442
 
443
+ def _add_server_arguments(parser):
444
+ """Add common server arguments to a parser"""
445
+ parser.add_argument("--port", type=int, help="Port number to serve on")
446
+ parser.add_argument(
447
+ "--log-level",
448
+ type=str,
449
+ help="Log level for the server",
450
+ choices=["critical", "error", "warning", "info", "debug", "trace"],
451
+ default="info",
452
+ )
453
+ parser.add_argument(
454
+ "--llamacpp",
455
+ type=str,
456
+ help=f"LlamaCpp backend to use",
457
+ choices=["vulkan", "rocm"],
458
+ )
459
+ parser.add_argument(
460
+ "--ctx-size",
461
+ type=int,
462
+ help="Context size for the model (default: 4096 for llamacpp, truncates prompts for other recipes)",
463
+ )
464
+
465
+
415
466
  def main():
416
467
  parser = argparse.ArgumentParser(
417
468
  description="Serve LLMs on CPU, GPU, and NPU.",
@@ -430,14 +481,7 @@ def main():
430
481
 
431
482
  # Serve command
432
483
  serve_parser = subparsers.add_parser("serve", help="Start server")
433
- serve_parser.add_argument("--port", type=int, help="Port number to serve on")
434
- serve_parser.add_argument(
435
- "--log-level",
436
- type=str,
437
- help="Log level for the server",
438
- choices=["critical", "error", "warning", "info", "debug", "trace"],
439
- default="info",
440
- )
484
+ _add_server_arguments(serve_parser)
441
485
  if os.name == "nt":
442
486
  serve_parser.add_argument(
443
487
  "--no-tray",
@@ -513,6 +557,7 @@ def main():
513
557
  "model",
514
558
  help="Lemonade Server model name to run",
515
559
  )
560
+ _add_server_arguments(run_parser)
516
561
 
517
562
  args = parser.parse_args()
518
563
 
@@ -535,6 +580,8 @@ def main():
535
580
  port=args.port,
536
581
  log_level=args.log_level,
537
582
  tray=not args.no_tray,
583
+ llamacpp_backend=args.llamacpp,
584
+ ctx_size=args.ctx_size,
538
585
  )
539
586
  elif args.command == "status":
540
587
  status()
@@ -553,7 +600,13 @@ def main():
553
600
  elif args.command == "stop":
554
601
  stop()
555
602
  elif args.command == "run":
556
- run(args.model)
603
+ run(
604
+ args.model,
605
+ port=args.port,
606
+ log_level=args.log_level,
607
+ llamacpp_backend=args.llamacpp,
608
+ ctx_size=args.ctx_size,
609
+ )
557
610
  elif args.command == "help" or not args.command:
558
611
  parser.print_help()
559
612
 
@@ -190,7 +190,13 @@
190
190
  "checkpoint": "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF:Qwen3-30B-A3B-Instruct-2507-Q4_0.gguf",
191
191
  "recipe": "llamacpp",
192
192
  "suggested": true,
193
- "labels": ["coding"]
193
+ "labels": ["hot"]
194
+ },
195
+ "Qwen3-Coder-30B-A3B-Instruct-GGUF": {
196
+ "checkpoint": "unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf",
197
+ "recipe": "llamacpp",
198
+ "suggested": true,
199
+ "labels": ["coding","hot"]
194
200
  },
195
201
  "Gemma-3-4b-it-GGUF": {
196
202
  "checkpoint": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
@@ -213,6 +219,13 @@
213
219
  "suggested": true,
214
220
  "labels": ["vision"]
215
221
  },
222
+ "Cogito-v2-llama-109B-MoE-GGUF": {
223
+ "checkpoint": "unsloth/cogito-v2-preview-llama-109B-MoE-GGUF:Q4_K_M",
224
+ "mmproj": "mmproj-F16.gguf",
225
+ "recipe": "llamacpp",
226
+ "suggested": true,
227
+ "labels": ["vision","hot"]
228
+ },
216
229
  "nomic-embed-text-v1-GGUF": {
217
230
  "checkpoint": "nomic-ai/nomic-embed-text-v1-GGUF:Q4_K_S",
218
231
  "recipe": "llamacpp",
@@ -248,5 +261,17 @@
248
261
  "recipe": "llamacpp",
249
262
  "suggested": true,
250
263
  "labels": ["reasoning", "coding"]
264
+ },
265
+ "gpt-oss-120b-GGUF": {
266
+ "checkpoint": "unsloth/gpt-oss-120b-GGUF:Q4_K_M",
267
+ "recipe": "llamacpp",
268
+ "suggested": true,
269
+ "labels": ["hot", "reasoning"]
270
+ },
271
+ "gpt-oss-20b-GGUF": {
272
+ "checkpoint": "unsloth/gpt-oss-20b-GGUF:Q4_K_M",
273
+ "recipe": "llamacpp",
274
+ "suggested": true,
275
+ "labels": ["hot", "reasoning"]
251
276
  }
252
277
  }