lemonade-sdk 8.0.5__tar.gz → 8.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic. See the package registry's advisory page for more details.

Files changed (79)
  1. {lemonade_sdk-8.0.5/src/lemonade_sdk.egg-info → lemonade_sdk-8.0.6}/PKG-INFO +3 -3
  2. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/README.md +1 -1
  3. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/setup.py +1 -1
  4. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/cache.py +3 -1
  5. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/adapter.py +6 -0
  6. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/huggingface/utils.py +6 -5
  7. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/llamacpp/bench.py +26 -46
  8. lemonade_sdk-8.0.6/src/lemonade/tools/llamacpp/load.py +185 -0
  9. lemonade_sdk-8.0.6/src/lemonade/tools/llamacpp/utils.py +612 -0
  10. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/oga/bench.py +5 -6
  11. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/oga/utils.py +8 -2
  12. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/prompt.py +17 -25
  13. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/report/table.py +12 -9
  14. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/llamacpp.py +80 -92
  15. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/serve.py +3 -0
  16. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/static/styles.css +116 -20
  17. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/static/webapp.html +11 -6
  18. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/tray.py +7 -0
  19. lemonade_sdk-8.0.6/src/lemonade/version.py +1 -0
  20. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6/src/lemonade_sdk.egg-info}/PKG-INFO +3 -3
  21. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_sdk.egg-info/SOURCES.txt +1 -0
  22. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_sdk.egg-info/requires.txt +1 -1
  23. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_server/model_manager.py +4 -148
  24. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_server/server_models.json +11 -0
  25. lemonade_sdk-8.0.5/src/lemonade/tools/llamacpp/load.py +0 -277
  26. lemonade_sdk-8.0.5/src/lemonade/version.py +0 -1
  27. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/LICENSE +0 -0
  28. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/NOTICE.md +0 -0
  29. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/setup.cfg +0 -0
  30. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/__init__.py +0 -0
  31. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/api.py +0 -0
  32. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/cli.py +0 -0
  33. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/__init__.py +0 -0
  34. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/build.py +0 -0
  35. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/cli_helpers.py +0 -0
  36. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/exceptions.py +0 -0
  37. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/filesystem.py +0 -0
  38. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/inference_engines.py +0 -0
  39. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/network.py +0 -0
  40. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/printing.py +0 -0
  41. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/status.py +0 -0
  42. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/system_info.py +0 -0
  43. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/common/test_helpers.py +0 -0
  44. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/profilers/__init__.py +0 -0
  45. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/profilers/memory_tracker.py +0 -0
  46. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/profilers/profiler.py +0 -0
  47. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/sequence.py +0 -0
  48. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/state.py +0 -0
  49. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/__init__.py +0 -0
  50. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/accuracy.py +0 -0
  51. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/bench.py +0 -0
  52. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/huggingface/bench.py +0 -0
  53. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/huggingface/load.py +0 -0
  54. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/humaneval.py +0 -0
  55. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/management_tools.py +0 -0
  56. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/mmlu.py +0 -0
  57. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/oga/__init__.py +0 -0
  58. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/oga/load.py +0 -0
  59. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/perplexity.py +0 -0
  60. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/quark/__init__.py +0 -0
  61. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/quark/quark_load.py +0 -0
  62. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/quark/quark_quantize.py +0 -0
  63. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/report/__init__.py +0 -0
  64. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/report/llm_report.py +0 -0
  65. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/__init__.py +0 -0
  66. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/static/favicon.ico +0 -0
  67. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/tool_calls.py +0 -0
  68. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/utils/port.py +0 -0
  69. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/utils/system_tray.py +0 -0
  70. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/utils/thread.py +0 -0
  71. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/server/webapp.py +0 -0
  72. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade/tools/tool.py +0 -0
  73. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_install/__init__.py +0 -0
  74. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_install/install.py +0 -0
  75. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  76. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  77. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
  78. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_server/cli.py +0 -0
  79. {lemonade_sdk-8.0.5 → lemonade_sdk-8.0.6}/src/lemonade_server/pydantic_models.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lemonade-sdk
3
- Version: 8.0.5
3
+ Version: 8.0.6
4
4
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
5
5
  Author-email: lemonade@amd.com
6
6
  Requires-Python: >=3.10, <3.13
@@ -23,7 +23,7 @@ Requires-Dist: zstandard
23
23
  Requires-Dist: fastapi
24
24
  Requires-Dist: uvicorn[standard]
25
25
  Requires-Dist: openai>=1.81.0
26
- Requires-Dist: transformers<=4.51.3
26
+ Requires-Dist: transformers<=4.53.2
27
27
  Requires-Dist: jinja2
28
28
  Requires-Dist: tabulate
29
29
  Requires-Dist: sentencepiece
@@ -284,7 +284,7 @@ New contributors can find beginner-friendly issues tagged with "Good First Issue
284
284
 
285
285
  ## Maintainers
286
286
 
287
- This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), email [lemonade@amd.com](mailto:lemonade@amd.com), or join our [Discord](https://discord.gg/5xXzkMu8Zk).
287
+ This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), emailing [lemonade@amd.com](mailto:lemonade@amd.com), or joining our [Discord](https://discord.gg/5xXzkMu8Zk).
288
288
 
289
289
  ## License
290
290
 
@@ -202,7 +202,7 @@ New contributors can find beginner-friendly issues tagged with "Good First Issue
202
202
 
203
203
  ## Maintainers
204
204
 
205
- This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), email [lemonade@amd.com](mailto:lemonade@amd.com), or join our [Discord](https://discord.gg/5xXzkMu8Zk).
205
+ This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), emailing [lemonade@amd.com](mailto:lemonade@amd.com), or joining our [Discord](https://discord.gg/5xXzkMu8Zk).
206
206
 
207
207
  ## License
208
208
 
@@ -45,7 +45,7 @@ setup(
45
45
  "fastapi",
46
46
  "uvicorn[standard]",
47
47
  "openai>=1.81.0",
48
- "transformers<=4.51.3",
48
+ "transformers<=4.53.2",
49
49
  "jinja2",
50
50
  "tabulate",
51
51
  "sentencepiece",
@@ -34,7 +34,7 @@ def build_name(input_name):
34
34
  """
35
35
  Name the lemonade build by concatenating these two factors:
36
36
  1. Sanitize the input name (typically a model checkpoint name) by
37
- replacing any `/` characters with `_`.
37
+ replacing any `/` characters with `_` and ':' characters with '-'.
38
38
  2. Timestamp to ensure that builds in the same cache will not
39
39
  collide in the same build directory.
40
40
 
@@ -47,6 +47,7 @@ def build_name(input_name):
47
47
  else:
48
48
  # Sanitize the input name
49
49
  input_name_sanitized = input_name.replace("/", "_")
50
+ input_name_sanitized = input_name_sanitized.replace(":", "-")
50
51
 
51
52
  # Get the formatted timestamp string
52
53
  timestamp = get_timestamp()
@@ -79,6 +80,7 @@ class Keys:
79
80
  MAX_MEMORY_USED_GB = "max_memory_used_GB"
80
81
  MAX_MEMORY_USED_GBYTE = "max_memory_used_gbyte"
81
82
  RYZEN_AI_VERSION_INFO = "ryzen_ai_version_info"
83
+ LLAMA_CLI_VERSION_INFO = "llama_cli_version_info"
82
84
 
83
85
 
84
86
  # This file was originally licensed under Apache 2.0. It has been modified.
@@ -13,6 +13,9 @@ class ModelAdapter(abc.ABC):
13
13
  """
14
14
  self.tokens_per_second = None
15
15
  self.time_to_first_token = None
16
+ self.prompt_tokens = None
17
+ self.response_tokens = None
18
+
16
19
  self.type = "generic"
17
20
 
18
21
  @abc.abstractmethod
@@ -22,6 +25,9 @@ class ModelAdapter(abc.ABC):
22
25
 
23
26
  We try to keep the signature here minimal to allow for maximum compatibility
24
27
  with recipe components, which themselves may not support a lot of arguments.
28
+
29
+ The generate method should store prompt and response lengths (in tokens)
30
+ in the prompt_tokens and response_tokens members.
25
31
  """
26
32
 
27
33
 
@@ -108,7 +108,9 @@ class HuggingfaceAdapter(ModelAdapter):
108
108
  with torch.no_grad(), torch.inference_mode():
109
109
  outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)
110
110
 
111
- return outputs
111
+ self.prompt_tokens = input_ids.shape[1]
112
+ self.response_tokens = len(outputs[0]) - self.prompt_tokens
113
+ return outputs
112
114
 
113
115
  def _model_call(self, input_tensor):
114
116
  """Forward pass through the model to get logits
@@ -341,12 +343,11 @@ def benchmark_huggingface_llm(
341
343
 
342
344
  latency = end_time - start_time
343
345
 
344
- token_len = outputs.shape[1] - input_ids.shape[1]
345
- tokens_out_len_list.append(token_len)
346
+ tokens_out_len_list.append(model.response_tokens)
346
347
 
347
348
  # Only count an iteration if it produced enough tokens
348
- if token_len >= target_output_tokens:
349
- per_iteration_result.append((latency, token_len))
349
+ if model.response_tokens >= target_output_tokens:
350
+ per_iteration_result.append((latency, model.response_tokens))
350
351
 
351
352
  report_progress_fn(
352
353
  (warmup_iterations + count + 1) / (warmup_iterations + iterations)
@@ -3,27 +3,31 @@ import statistics
3
3
  from statistics import StatisticsError
4
4
  from lemonade.state import State
5
5
  from lemonade.cache import Keys
6
- from lemonade.tools.llamacpp.load import LlamaCppAdapter
6
+ from lemonade.tools.llamacpp.utils import LlamaCppAdapter
7
7
  from lemonade.tools.bench import Bench
8
8
 
9
9
 
10
10
  class LlamaCppBench(Bench):
11
+ """
12
+ Benchmark a llama.cpp model
13
+ """
11
14
 
12
- unique_name = "llama-cpp-bench"
15
+ unique_name = "llamacpp-bench"
13
16
 
14
17
  def __init__(self):
15
18
  super().__init__()
16
19
 
17
20
  # Additional statistics generated by this bench tool
18
- self.status_stats += [
21
+ self.status_stats.insert(
22
+ self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
19
23
  Keys.STD_DEV_TOKENS_PER_SECOND,
20
- ]
24
+ )
21
25
  self.std_dev_token_generation_tokens_per_second_list = []
22
26
 
23
27
  @staticmethod
24
28
  def parser(add_help: bool = True) -> argparse.ArgumentParser:
25
29
  parser = __class__.helpful_parser(
26
- short_description="Benchmark a llama.cpp model",
30
+ short_description="Benchmark an LLM in llama.cpp",
27
31
  add_help=add_help,
28
32
  )
29
33
 
@@ -53,38 +57,20 @@ class LlamaCppBench(Bench):
53
57
  f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
54
58
  "loaded first. Please run load-llama-cpp before this tool."
55
59
  )
60
+ model: LlamaCppAdapter = state.model
56
61
 
57
- iteration_tokens_per_second = []
58
- iteration_time_to_first_token = []
62
+ per_iteration_tokens_per_second = []
63
+ per_iteration_time_to_first_token = []
59
64
 
60
65
  for iteration in range(iterations + warmup_iterations):
61
66
  try:
62
67
  # Use the adapter's generate method which already has the timeout
63
68
  # and error handling
64
- raw_output, stderr = state.model.generate(prompt, return_raw=True)
65
-
66
- # Parse the timing information from the output
67
- ms_per_token = None
68
- time_to_first_token_ms = None
69
- input_tokens = None
70
-
71
- # Look for timing in both stdout and stderr
72
- for output in [raw_output, stderr]:
73
- for line in output.splitlines():
74
- if "llama_perf_context_print: eval time =" in line:
75
- parts = line.split("(")[1].strip()
76
- parts = parts.split(",")
77
- ms_per_token = float(
78
- parts[0].split("ms per token")[0].strip()
79
- )
80
- if "llama_perf_context_print: prompt eval time =" in line:
81
- parts = line.split("=")[1].split("/")
82
- time_to_first_token_ms = float(
83
- parts[0].split("ms")[0].strip()
84
- )
85
- input_tokens = int(parts[1].split("tokens")[0].strip())
86
-
87
- if ms_per_token is None or time_to_first_token_ms is None:
69
+ model.time_to_first_token = None
70
+ model.tokens_per_second = None
71
+ raw_output, stderr = model.generate(prompt, return_raw=True)
72
+
73
+ if model.time_to_first_token is None or model.tokens_per_second is None:
88
74
  error_msg = (
89
75
  "Could not find timing information in llama.cpp output.\n"
90
76
  )
@@ -92,17 +78,11 @@ class LlamaCppBench(Bench):
92
78
  error_msg += "Stderr:\n" + stderr
93
79
  raise Exception(error_msg)
94
80
 
95
- # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0
96
- # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
97
- # as performance data for generating a few tokens is not relevant.
98
- tokens_per_second = 0
99
- if output_tokens > 5 and ms_per_token > 0:
100
- tokens_per_second = 1000 / ms_per_token
101
- time_to_first_token = time_to_first_token_ms / 1000
81
+ self.tokens_out_len_list.append(model.response_tokens)
102
82
 
103
83
  if iteration > warmup_iterations - 1:
104
- iteration_tokens_per_second.append(tokens_per_second)
105
- iteration_time_to_first_token.append(time_to_first_token)
84
+ per_iteration_tokens_per_second.append(model.tokens_per_second)
85
+ per_iteration_time_to_first_token.append(model.time_to_first_token)
106
86
 
107
87
  report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
108
88
 
@@ -110,25 +90,25 @@ class LlamaCppBench(Bench):
110
90
  error_msg = f"Failed to run benchmark: {str(e)}"
111
91
  raise Exception(error_msg)
112
92
 
113
- self.input_ids_len_list.append(input_tokens)
114
- mean_time_to_first_token = statistics.mean(iteration_time_to_first_token)
93
+ self.input_ids_len_list.append(model.prompt_tokens)
94
+ mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
115
95
  self.mean_time_to_first_token_list.append(mean_time_to_first_token)
116
96
  self.prefill_tokens_per_second_list.append(
117
- input_tokens / mean_time_to_first_token
97
+ model.prompt_tokens / mean_time_to_first_token
118
98
  )
119
99
  self.token_generation_tokens_per_second_list.append(
120
- statistics.mean(iteration_tokens_per_second)
100
+ statistics.mean(per_iteration_tokens_per_second)
121
101
  )
122
102
  try:
123
103
  self.std_dev_time_to_first_token_list.append(
124
- statistics.stdev(iteration_time_to_first_token)
104
+ statistics.stdev(per_iteration_time_to_first_token)
125
105
  )
126
106
  except StatisticsError:
127
107
  # Less than 2 measurements
128
108
  self.std_dev_time_to_first_token_list.append(None)
129
109
  try:
130
110
  self.std_dev_token_generation_tokens_per_second_list.append(
131
- statistics.stdev(iteration_tokens_per_second)
111
+ statistics.stdev(per_iteration_tokens_per_second)
132
112
  )
133
113
  except StatisticsError:
134
114
  # Less than 2 measurements
@@ -0,0 +1,185 @@
1
+ import argparse
2
+ import os
3
+ import lemonade.common.printing as printing
4
+ import lemonade.common.status as status
5
+ from lemonade.state import State
6
+ from lemonade.tools import FirstTool
7
+ from lemonade.cache import Keys
8
+
9
+
10
+ class LoadLlamaCpp(FirstTool):
11
+ unique_name = "llamacpp-load"
12
+
13
+ def __init__(self):
14
+ super().__init__(monitor_message="Loading llama.cpp model")
15
+
16
+ self.status_stats = [
17
+ Keys.DEVICE,
18
+ ]
19
+
20
+ @staticmethod
21
+ def parser(add_help: bool = True) -> argparse.ArgumentParser:
22
+ parser = __class__.helpful_parser(
23
+ short_description="Wrap llama.cpp models with an API",
24
+ add_help=add_help,
25
+ )
26
+
27
+ parser.add_argument(
28
+ "-d",
29
+ "--device",
30
+ choices=["cpu", "igpu"],
31
+ default="igpu",
32
+ help="Which device to load the model on to (default: igpu)",
33
+ )
34
+
35
+ default_threads = -1
36
+ parser.add_argument(
37
+ "--threads",
38
+ required=False,
39
+ type=int,
40
+ default=default_threads,
41
+ help=f"Number of threads to use during generation (default: {default_threads})",
42
+ )
43
+
44
+ context_size = 4096
45
+ parser.add_argument(
46
+ "--context-size",
47
+ required=False,
48
+ type=int,
49
+ default=context_size,
50
+ help=f"Size of the prompt context (default: {context_size}. 0 = loaded from model)",
51
+ )
52
+
53
+ output_tokens = 512
54
+ parser.add_argument(
55
+ "--output-tokens",
56
+ required=False,
57
+ type=int,
58
+ default=output_tokens,
59
+ help=f"Maximum number of output tokens to generate (default: {output_tokens})",
60
+ )
61
+
62
+ parser.add_argument(
63
+ "--reasoning",
64
+ action="store_true",
65
+ help="Set this flag to indicate the model is a reasoning model",
66
+ )
67
+
68
+ return parser
69
+
70
+ def run(
71
+ self,
72
+ state: State,
73
+ input: str = "",
74
+ device: str = "igpu",
75
+ context_size: int = 512,
76
+ threads: int = 1,
77
+ output_tokens: int = 512,
78
+ reasoning: bool = False,
79
+ ) -> State:
80
+ """
81
+ Load a llama.cpp model
82
+ """
83
+
84
+ from lemonade.common.network import is_offline
85
+ from lemonade.tools.llamacpp.utils import (
86
+ install_llamacpp,
87
+ get_llama_cli_exe_path,
88
+ get_llama_installed_version,
89
+ parse_checkpoint,
90
+ download_gguf,
91
+ get_local_checkpoint_path,
92
+ LlamaCppTokenizerAdapter,
93
+ LlamaCppAdapter,
94
+ )
95
+
96
+ # Validate and install llama.cpp, if needed
97
+ install_llamacpp()
98
+
99
+ # Check if input is a local folder containing a .GGUF model
100
+ if os.path.isdir(input):
101
+ # input is a local folder
102
+ local_model_folder = os.path.abspath(input)
103
+ checkpoint = "local_model"
104
+ state.checkpoint = checkpoint
105
+ state.save_stat(Keys.CHECKPOINT, checkpoint)
106
+ state.save_stat(Keys.LOCAL_MODEL_FOLDER, local_model_folder)
107
+
108
+ # See if there is a file ending in ".gguf" in this folder
109
+ dir = os.listdir(input)
110
+ gguf_files = [filename for filename in dir if filename.endswith(".gguf")]
111
+ if len(gguf_files) == 0:
112
+ raise ValueError(
113
+ f"The folder {input} does not contain a GGUF model file."
114
+ )
115
+ model_to_use = gguf_files[0]
116
+ full_model_path = os.path.join(local_model_folder, model_to_use)
117
+
118
+ else:
119
+ # Input is a model checkpoint
120
+ checkpoint = input
121
+ state.checkpoint = checkpoint
122
+ state.save_stat(Keys.CHECKPOINT, checkpoint)
123
+
124
+ # Make sure that a variant is provided for the GGUF model
125
+ base_checkpoint, variant = parse_checkpoint(checkpoint)
126
+ if variant is None:
127
+ raise ValueError(
128
+ "You are required to provide a 'variant' when "
129
+ "selecting a GGUF model. The variant is provided "
130
+ "as CHECKPOINT:VARIANT. For example: "
131
+ "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
132
+ "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
133
+ )
134
+
135
+ # Auto-detect offline status
136
+ offline = is_offline()
137
+ if offline:
138
+ printing.log_warning(
139
+ "Network connectivity to huggingface.co not detected. Running in offline mode."
140
+ )
141
+ full_model_path, model_to_use = get_local_checkpoint_path(
142
+ base_checkpoint, variant
143
+ )
144
+ if not full_model_path:
145
+ raise ValueError(
146
+ f"Model {checkpoint} is not available locally."
147
+ f"Cannot download in offline mode."
148
+ )
149
+
150
+ else:
151
+
152
+ snapshot_files = download_gguf(checkpoint)
153
+ full_model_path = snapshot_files["variant"]
154
+ model_to_use = os.path.basename(full_model_path)
155
+
156
+ llama_cli_exe_path = get_llama_cli_exe_path()
157
+ printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
158
+
159
+ # Get the directory containing the executable for shared libraries
160
+ lib_dir = os.path.dirname(llama_cli_exe_path)
161
+
162
+ # Pass the model and inputs into state
163
+ state.model = LlamaCppAdapter(
164
+ model=full_model_path,
165
+ device=device,
166
+ output_tokens=output_tokens,
167
+ context_size=context_size,
168
+ threads=threads,
169
+ executable=llama_cli_exe_path,
170
+ reasoning=reasoning,
171
+ lib_dir=lib_dir,
172
+ )
173
+ state.tokenizer = LlamaCppTokenizerAdapter()
174
+ state.device = device
175
+
176
+ # Save initial stats
177
+ state.save_stat(Keys.DEVICE, device)
178
+ state.save_stat(Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version())
179
+
180
+ status.add_to_state(state=state, name=input, model=model_to_use)
181
+ return state
182
+
183
+
184
+ # This file was originally licensed under Apache 2.0. It has been modified.
185
+ # Modifications Copyright (c) 2025 AMD