lemonade-sdk 8.0.6__py3-none-any.whl → 8.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -2,7 +2,6 @@ import os
 import sys
 import importlib.util
 import importlib.metadata
-import platform
 import subprocess
 from abc import ABC, abstractmethod
 from typing import Dict, Optional
@@ -19,7 +18,9 @@ class InferenceEngineDetector:
         self.llamacpp_detector = LlamaCppDetector()
         self.transformers_detector = TransformersDetector()

-    def detect_engines_for_device(self, device_type: str) -> Dict[str, Dict]:
+    def detect_engines_for_device(
+        self, device_type: str, device_name: str
+    ) -> Dict[str, Dict]:
         """
         Detect all available inference engines for a specific device type.

@@ -36,10 +37,19 @@ class InferenceEngineDetector:
         if oga_info:
             engines["oga"] = oga_info

-        # Detect llama.cpp availability
-        llamacpp_info = self.llamacpp_detector.detect_for_device(device_type)
+        # Detect llama.cpp vulkan availability
+        llamacpp_info = self.llamacpp_detector.detect_for_device(
+            device_type, device_name, "vulkan"
+        )
+        if llamacpp_info:
+            engines["llamacpp-vulkan"] = llamacpp_info
+
+        # Detect llama.cpp rocm availability
+        llamacpp_info = self.llamacpp_detector.detect_for_device(
+            device_type, device_name, "rocm"
+        )
         if llamacpp_info:
-            engines["llamacpp"] = llamacpp_info
+            engines["llamacpp-rocm"] = llamacpp_info

         # Detect Transformers availability
         transformers_info = self.transformers_detector.detect_for_device(device_type)
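
As a minimal sketch of the new behavior (the device name and values below are illustrative, not taken from the package), each device entry can now report the two llama.cpp backends side by side:

    detector = InferenceEngineDetector()
    engines = detector.detect_engines_for_device("amd_dgpu", "AMD Radeon RX 7900 XTX")
    # Plausible shape, based on the keys added above:
    # {
    #     "oga": {...},
    #     "llamacpp-vulkan": {"available": True, "version": "...", "backend": "vulkan"},
    #     "llamacpp-rocm": {"available": False, "error": "rocm binaries not installed"},
    #     ...
    # }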
@@ -206,57 +216,40 @@ class LlamaCppDetector(BaseEngineDetector):
     Detector for llama.cpp.
     """

-    def detect_for_device(self, device_type: str) -> Optional[Dict]:
+    def detect_for_device(
+        self, device_type: str, device_name: str, backend: str
+    ) -> Optional[Dict]:
         """
         Detect llama.cpp availability for specific device.
         """
         try:
-            # Map device types to llama.cpp backends
-            device_backend_map = {
-                "cpu": "cpu",
-                "amd_igpu": "vulkan",
-                "amd_dgpu": "vulkan",
-            }

-            if device_type not in device_backend_map:
+            if device_type not in ["cpu", "amd_igpu", "amd_dgpu"]:
                 return None

-            backend = device_backend_map[device_type]
-            is_installed = self.is_installed()
-
-            # Check requirements based on backend
-            if backend == "vulkan":
-                vulkan_available = self._check_vulkan_support()
-                if not vulkan_available:
-                    return {"available": False, "error": "Vulkan not available"}
-
-                # Vulkan is available
-                if is_installed:
-                    result = {
-                        "available": True,
-                        "version": self._get_llamacpp_version(),
-                        "backend": backend,
-                    }
-                    return result
-                else:
-                    return {
-                        "available": False,
-                        "error": "llama.cpp binaries not installed",
-                    }
-            else:
-                # CPU backend
-                if is_installed:
-                    result = {
-                        "available": True,
-                        "version": self._get_llamacpp_version(),
-                        "backend": backend,
-                    }
-                    return result
-                else:
-                    return {
-                        "available": False,
-                        "error": "llama.cpp binaries not installed",
-                    }
+            # Check if the device is supported by the backend
+            if device_type == "cpu":
+                device_supported = True
+            elif device_type == "amd_igpu" or device_type == "amd_dgpu":
+                if backend == "vulkan":
+                    device_supported = self._check_vulkan_support()
+                elif backend == "rocm":
+                    device_supported = self._check_rocm_support(device_name.lower())
+            if not device_supported:
+                return {"available": False, "error": f"{backend} not available"}
+
+            is_installed = self.is_installed(backend)
+            if not is_installed:
+                return {
+                    "available": False,
+                    "error": f"{backend} binaries not installed",
+                }
+
+            return {
+                "available": True,
+                "version": self._get_llamacpp_version(backend),
+                "backend": backend,
+            }

         except (ImportError, OSError, subprocess.SubprocessError) as e:
             return {
@@ -264,35 +257,17 @@ class LlamaCppDetector(BaseEngineDetector):
                 "error": f"llama.cpp detection failed: {str(e)}",
             }

-    def is_installed(self) -> bool:
+    def is_installed(self, backend: str) -> bool:
         """
-        Check if llama.cpp binaries are available.
+        Check if llama.cpp binaries are available for any backend.
         """
+        from lemonade.tools.llamacpp.utils import get_llama_server_exe_path

-        # Check lemonade-managed binary locations
         try:
-
-            # Check lemonade server directory
-            server_base_dir = os.path.join(
-                os.path.dirname(sys.executable), "llama_server"
-            )
-
-            if platform.system().lower() == "windows":
-                server_exe_path = os.path.join(server_base_dir, "llama-server.exe")
-            else:
-                # Check both build/bin and root directory locations
-                build_bin_path = os.path.join(
-                    server_base_dir, "build", "bin", "llama-server"
-                )
-                root_path = os.path.join(server_base_dir, "llama-server")
-                server_exe_path = (
-                    build_bin_path if os.path.exists(build_bin_path) else root_path
-                )
-
+            server_exe_path = get_llama_server_exe_path(backend)
             if os.path.exists(server_exe_path):
                 return True
-
-        except (ImportError, OSError):
+        except (ImportError, OSError, ValueError):
             pass

         return False
@@ -334,13 +309,22 @@ class LlamaCppDetector(BaseEngineDetector):
         except OSError:
             return False

-    def _get_llamacpp_version(self) -> str:
+    def _check_rocm_support(self, device_name: str) -> bool:
+        """
+        Check if ROCM is available for GPU acceleration.
+        """
+        from lemonade.tools.llamacpp.utils import identify_rocm_arch_from_name
+
+        return identify_rocm_arch_from_name(device_name) is not None
+
+    def _get_llamacpp_version(self, backend: str) -> str:
         """
-        Get llama.cpp version from lemonade's managed installation.
+        Get llama.cpp version from lemonade's managed installation for specific backend.
         """
         try:
+            # Use backend-specific path - same logic as get_llama_folder_path in utils.py
             server_base_dir = os.path.join(
-                os.path.dirname(sys.executable), "llama_server"
+                os.path.dirname(sys.executable), backend, "llama_server"
             )
             version_file = os.path.join(server_base_dir, "version.txt")

@@ -401,15 +385,16 @@ class TransformersDetector(BaseEngineDetector):
         )


-def detect_inference_engines(device_type: str) -> Dict[str, Dict]:
+def detect_inference_engines(device_type: str, device_name: str) -> Dict[str, Dict]:
     """
     Helper function to detect inference engines for a device type.

     Args:
         device_type: "cpu", "amd_igpu", "amd_dgpu", or "npu"
+        device_name: device name

     Returns:
         dict: Engine availability information.
     """
     detector = InferenceEngineDetector()
-    return detector.detect_engines_for_device(device_type)
+    return detector.detect_engines_for_device(device_type, device_name)
@@ -1,7 +1,7 @@
 import os
 from typing import Optional
 import socket
-from huggingface_hub import model_info
+from huggingface_hub import model_info, snapshot_download


 def is_offline():
@@ -48,3 +48,20 @@ def get_base_model(checkpoint: str) -> Optional[str]:
     except Exception:  # pylint: disable=broad-except
         pass
     return None
+
+
+def custom_snapshot_download(repo_id, **kwargs):
+    """
+    Custom snapshot download with retry logic for Windows symlink privilege errors.
+    """
+    for attempt in range(2):
+        try:
+            return snapshot_download(repo_id=repo_id, **kwargs)
+        except OSError as e:
+            if (
+                hasattr(e, "winerror")
+                and e.winerror == 1314  # pylint: disable=no-member
+                and attempt < 1
+            ):
+                continue
+            raise
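
A minimal usage sketch for the new retry helper; the repo id and kwargs below are illustrative, and kwargs are forwarded unchanged to huggingface_hub.snapshot_download:

    # Retries once if Windows raises WinError 1314 ("A required privilege is
    # not held by the client"), which can occur when symlinks cannot be created.
    model_dir = custom_snapshot_download(
        "Qwen/Qwen2.5-0.5B-Instruct-GGUF",  # illustrative repo id
        local_dir="./models",
    )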
@@ -47,11 +47,10 @@ class SystemInfo(ABC):
         Returns:
             dict: Device information.
         """
-
         device_dict = {
             "cpu": self.get_cpu_device(),
-            "amd_igpu": self.get_amd_igpu_device(),
-            "amd_dgpu": self.get_amd_dgpu_devices(),
+            "amd_igpu": self.get_amd_igpu_device(include_inference_engines=True),
+            "amd_dgpu": self.get_amd_dgpu_devices(include_inference_engines=True),
             "npu": self.get_npu_device(),
         }
         return device_dict
@@ -66,7 +65,7 @@ class SystemInfo(ABC):
         """

     @abstractmethod
-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information.

@@ -75,7 +74,7 @@ class SystemInfo(ABC):
         """

     @abstractmethod
-    def get_amd_dgpu_devices(self) -> list:
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False) -> list:
         """
         Retrieves AMD discrete GPU device information.

@@ -143,8 +142,9 @@ class WindowsSystemInfo(SystemInfo):
             processors = self.connection.Win32_Processor()
             if processors:
                 processor = processors[0]
+                cpu_name = processor.Name.strip()
                 cpu_info = {
-                    "name": processor.Name.strip(),
+                    "name": cpu_name,
                     "cores": processor.NumberOfCores,
                     "threads": processor.NumberOfLogicalProcessors,
                     "max_clock_speed_mhz": processor.MaxClockSpeed,
@@ -152,7 +152,9 @@ class WindowsSystemInfo(SystemInfo):
                 }

                 # Add inference engine detection
-                cpu_info["inference_engines"] = self._detect_inference_engines("cpu")
+                cpu_info["inference_engines"] = self._detect_inference_engines(
+                    "cpu", cpu_name
+                )
                 return cpu_info

         except Exception as e:  # pylint: disable=broad-except
@@ -160,7 +162,7 @@ class WindowsSystemInfo(SystemInfo):

         return {"available": False, "error": "No CPU information found"}

-    def _detect_amd_gpus(self, gpu_type: str):
+    def _detect_amd_gpus(self, gpu_type: str, include_inference_engines: bool = False):
         """
         Shared AMD GPU detection logic for both integrated and discrete GPUs.
         Uses keyword-based classification for simplicity and reliability.
@@ -194,23 +196,25 @@ class WindowsSystemInfo(SystemInfo):
                    gpu_type == "discrete" and not is_integrated
                ):

-                   driver_version = self.get_driver_version(
-                       "AMD-OpenCL User Mode Driver"
-                   )
-
                    device_type = "amd_igpu" if is_integrated else "amd_dgpu"
                    gpu_info = {
                        "name": controller.Name,
-                       "driver_version": (
-                           driver_version if driver_version else "Unknown"
-                       ),
                        "available": True,
                    }

-                   # Add inference engine detection
-                   gpu_info["inference_engines"] = self._detect_inference_engines(
-                       device_type
+                   driver_version = self.get_driver_version(
+                       "AMD-OpenCL User Mode Driver"
+                   )
+                   gpu_info["driver_version"] = (
+                       driver_version if driver_version else "Unknown"
                    )
+
+                   if include_inference_engines:
+                       gpu_info["inference_engines"] = (
+                           self._detect_inference_engines(
+                               device_type, controller.Name
+                           )
+                       )
                    gpu_devices.append(gpu_info)

                except Exception as e:  # pylint: disable=broad-except
@@ -219,32 +223,36 @@ class WindowsSystemInfo(SystemInfo):

         return gpu_devices

-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information using keyword-based classification.

         Returns:
             dict: AMD iGPU device information.
         """
-        igpu_devices = self._detect_amd_gpus("integrated")
+        igpu_devices = self._detect_amd_gpus(
+            "integrated", include_inference_engines=include_inference_engines
+        )
         return (
             igpu_devices[0]
             if igpu_devices
             else {"available": False, "error": "No AMD integrated GPU found"}
         )

-    def get_amd_dgpu_devices(self):
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False):
         """
         Retrieves AMD discrete GPU device information using keyword-based classification.

         Returns:
             list: List of AMD dGPU device information.
         """
-        dgpu_devices = self._detect_amd_gpus("discrete")
+        dgpu_devices = self._detect_amd_gpus(
+            "discrete", include_inference_engines=include_inference_engines
+        )
         return (
             dgpu_devices
             if dgpu_devices
-            else {"available": False, "error": "No AMD discrete GPU found"}
+            else [{"available": False, "error": "No AMD discrete GPU found"}]
         )

     def get_npu_device(self) -> dict:
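
Because the no-GPU fallback is now a list, callers can iterate the dGPU result uniformly. A minimal sketch, assuming a WindowsSystemInfo instance named system_info:

    for gpu in system_info.get_amd_dgpu_devices(include_inference_engines=True):
        if gpu.get("available"):
            print(gpu["name"], list(gpu.get("inference_engines", {})))
        else:
            print("No AMD dGPU:", gpu.get("error"))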
@@ -267,7 +275,9 @@ class WindowsSystemInfo(SystemInfo):
                 }

                 # Add inference engine detection
-                npu_info["inference_engines"] = self._detect_inference_engines("npu")
+                npu_info["inference_engines"] = self._detect_inference_engines(
+                    "npu", "AMD NPU"
+                )
                 return npu_info
         except Exception as e:  # pylint: disable=broad-except
             return {"available": False, "error": f"NPU detection failed: {e}"}
@@ -438,12 +448,13 @@ class WindowsSystemInfo(SystemInfo):
         info_dict["Windows Power Setting"] = self.get_windows_power_setting()
         return info_dict

-    def _detect_inference_engines(self, device_type: str) -> dict:
+    def _detect_inference_engines(self, device_type: str, device_name: str) -> dict:
         """
         Detect available inference engines for a specific device type.

         Args:
             device_type: Device type ("cpu", "amd_igpu", "amd_dgpu", "npu")
+            device_name: Device name

         Returns:
             dict: Available inference engines and their information.
@@ -451,7 +462,7 @@ class WindowsSystemInfo(SystemInfo):
         try:
             from .inference_engines import detect_inference_engines

-            return detect_inference_engines(device_type)
+            return detect_inference_engines(device_type, device_name)
         except Exception as e:  # pylint: disable=broad-except
             return {"error": f"Inference engine detection failed: {str(e)}"}

@@ -467,13 +478,13 @@ class WSLSystemInfo(SystemInfo):
         """
         return {"available": False, "error": "Device detection not supported in WSL"}

-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information in WSL environment.
         """
         return {"available": False, "error": "GPU detection not supported in WSL"}

-    def get_amd_dgpu_devices(self) -> list:
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False) -> list:
         """
         Retrieves AMD discrete GPU device information in WSL environment.
         """
@@ -556,6 +567,7 @@ class LinuxSystemInfo(SystemInfo):
                     cpu_data["architecture"] = line.split(":")[1].strip()

             if "name" in cpu_data:
+                cpu_name = cpu_data.get("name", "Unknown")
                 cpu_info = {
                     "name": cpu_data.get("name", "Unknown"),
                     "cores": cpu_data.get("cores", "Unknown"),
@@ -565,14 +577,16 @@ class LinuxSystemInfo(SystemInfo):
                 }

                 # Add inference engine detection
-                cpu_info["inference_engines"] = self._detect_inference_engines("cpu")
+                cpu_info["inference_engines"] = self._detect_inference_engines(
+                    "cpu", cpu_name
+                )
                 return cpu_info
         except Exception as e:  # pylint: disable=broad-except
             return {"available": False, "error": f"CPU detection failed: {e}"}

         return {"available": False, "error": "No CPU information found"}

-    def _detect_amd_gpus(self, gpu_type: str):
+    def _detect_amd_gpus(self, gpu_type: str, include_inference_engines: bool = False):
         """
         Shared AMD GPU detection logic for both integrated and discrete GPUs.
         Uses keyword-based classification for simplicity and reliability.
@@ -611,11 +625,10 @@ class LinuxSystemInfo(SystemInfo):
                         "name": device_name,
                         "available": True,
                     }
-
-                    # Add inference engine detection
-                    gpu_info["inference_engines"] = self._detect_inference_engines(
-                        device_type
-                    )
+                    if include_inference_engines:
+                        gpu_info["inference_engines"] = (
+                            self._detect_inference_engines(device_type, device_name)
+                        )
                     gpu_devices.append(gpu_info)

             except Exception as e:  # pylint: disable=broad-except
@@ -624,32 +637,36 @@ class LinuxSystemInfo(SystemInfo):

         return gpu_devices

-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information using keyword-based classification.

         Returns:
             dict: AMD iGPU device information.
         """
-        igpu_devices = self._detect_amd_gpus("integrated")
+        igpu_devices = self._detect_amd_gpus(
+            "integrated", include_inference_engines=include_inference_engines
+        )
         return (
             igpu_devices[0]
             if igpu_devices
             else {"available": False, "error": "No AMD integrated GPU found"}
         )

-    def get_amd_dgpu_devices(self):
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False) -> list:
         """
         Retrieves AMD discrete GPU device information using keyword-based classification.

         Returns:
             list: List of AMD dGPU device information.
         """
-        dgpu_devices = self._detect_amd_gpus("discrete")
+        dgpu_devices = self._detect_amd_gpus(
+            "discrete", include_inference_engines=include_inference_engines
+        )
         return (
             dgpu_devices
             if dgpu_devices
-            else {"available": False, "error": "No AMD discrete GPU found"}
+            else [{"available": False, "error": "No AMD discrete GPU found"}]
         )

     def get_npu_device(self) -> dict:
@@ -741,7 +758,7 @@ class LinuxSystemInfo(SystemInfo):
         info_dict["Physical Memory"] = self.get_physical_memory()
         return info_dict

-    def _detect_inference_engines(self, device_type: str) -> dict:
+    def _detect_inference_engines(self, device_type: str, device_name: str) -> dict:
         """
         Detect available inference engines for a specific device type.

@@ -752,7 +769,7 @@ class LinuxSystemInfo(SystemInfo):
             dict: Available inference engines and their information.
         """
         try:
-            return detect_inference_engines(device_type)
+            return detect_inference_engines(device_type, device_name)
         except Exception as e:  # pylint: disable=broad-except
             return {"error": f"Inference engine detection failed: {str(e)}"}

@@ -771,7 +788,7 @@ class UnsupportedOSSystemInfo(SystemInfo):
             "error": "Device detection not supported on this operating system",
         }

-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information for unsupported OS.
         """
@@ -780,7 +797,7 @@ class UnsupportedOSSystemInfo(SystemInfo):
             "error": "Device detection not supported on this operating system",
         }

-    def get_amd_dgpu_devices(self) -> list:
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False) -> list:
         """
         Retrieves AMD discrete GPU device information for unsupported OS.
         """
@@ -68,7 +68,9 @@ class LlamaCppBench(Bench):
         # and error handling
         model.time_to_first_token = None
         model.tokens_per_second = None
-        raw_output, stderr = model.generate(prompt, return_raw=True)
+        raw_output, stderr = model.generate(
+            prompt, max_new_tokens=output_tokens, return_raw=True
+        )

         if model.time_to_first_token is None or model.tokens_per_second is None:
             error_msg = (
@@ -65,6 +65,13 @@ class LoadLlamaCpp(FirstTool):
             help="Set this flag to indicate the model is a reasoning model",
         )

+        parser.add_argument(
+            "--backend",
+            choices=["vulkan", "rocm"],
+            default="vulkan",
+            help="Backend to use for llama.cpp (default: vulkan)",
+        )
+
         return parser

     def run(
@@ -76,6 +83,7 @@ class LoadLlamaCpp(FirstTool):
         threads: int = 1,
         output_tokens: int = 512,
         reasoning: bool = False,
+        backend: str = "vulkan",
     ) -> State:
         """
         Load a llama.cpp model
@@ -93,8 +101,7 @@ class LoadLlamaCpp(FirstTool):
             LlamaCppAdapter,
         )

-        # Validate and install llama.cpp, if needed
-        install_llamacpp()
+        install_llamacpp(backend)

         # Check if input is a local folder containing a .GGUF model
         if os.path.isdir(input):
@@ -153,7 +160,7 @@ class LoadLlamaCpp(FirstTool):
            full_model_path = snapshot_files["variant"]
            model_to_use = os.path.basename(full_model_path)

-           llama_cli_exe_path = get_llama_cli_exe_path()
+           llama_cli_exe_path = get_llama_cli_exe_path(backend)
            printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")

            # Get the directory containing the executable for shared libraries
@@ -175,7 +182,9 @@ class LoadLlamaCpp(FirstTool):

         # Save initial stats
         state.save_stat(Keys.DEVICE, device)
-        state.save_stat(Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version())
+        state.save_stat(
+            Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
+        )

         status.add_to_state(state=state, name=input, model=model_to_use)
         return state
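
Taken together, the new backend argument threads from the CLI flag through installation, binary lookup, and version reporting. A minimal sketch of that flow, assuming these helpers are importable from lemonade.tools.llamacpp.utils as in the surrounding code:

    backend = "rocm"  # or "vulkan" (the default)
    install_llamacpp(backend)                       # install the backend-specific build if needed
    cli_path = get_llama_cli_exe_path(backend)      # backend-specific llama-cli binary
    version = get_llama_installed_version(backend)  # version of the installed backend build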