lemonade-sdk 8.1.11__tar.gz → 8.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.

Files changed (93)
  1. {lemonade_sdk-8.1.11/src/lemonade_sdk.egg-info → lemonade_sdk-8.2.0}/PKG-INFO +5 -3
  2. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/README.md +1 -0
  3. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/setup.py +3 -2
  4. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/cache.py +6 -1
  5. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/status.py +4 -4
  6. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/system_info.py +0 -26
  7. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/bench.py +22 -1
  8. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/flm/utils.py +70 -22
  9. lemonade_sdk-8.2.0/src/lemonade/tools/llamacpp/bench.py +224 -0
  10. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/llamacpp/load.py +30 -2
  11. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/llamacpp/utils.py +234 -15
  12. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/bench.py +0 -26
  13. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/load.py +38 -142
  14. lemonade_sdk-8.2.0/src/lemonade/tools/oga/migration.py +403 -0
  15. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/report/table.py +6 -0
  16. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/flm.py +2 -6
  17. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/llamacpp.py +20 -1
  18. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/serve.py +335 -17
  19. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/models.js +416 -18
  20. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/shared.js +44 -6
  21. lemonade_sdk-8.2.0/src/lemonade/tools/server/static/logs.html +57 -0
  22. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/styles.css +204 -0
  23. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/webapp.html +32 -0
  24. lemonade_sdk-8.2.0/src/lemonade/version.py +1 -0
  25. lemonade_sdk-8.2.0/src/lemonade_install/install.py +239 -0
  26. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0/src/lemonade_sdk.egg-info}/PKG-INFO +5 -3
  27. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/SOURCES.txt +1 -0
  28. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/requires.txt +3 -2
  29. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/cli.py +10 -0
  30. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/model_manager.py +172 -11
  31. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/server_models.json +102 -66
  32. lemonade_sdk-8.1.11/src/lemonade/tools/llamacpp/bench.py +0 -136
  33. lemonade_sdk-8.1.11/src/lemonade/tools/server/static/logs.html +0 -47
  34. lemonade_sdk-8.1.11/src/lemonade/version.py +0 -1
  35. lemonade_sdk-8.1.11/src/lemonade_install/install.py +0 -785
  36. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/LICENSE +0 -0
  37. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/NOTICE.md +0 -0
  38. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/pyproject.toml +0 -0
  39. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/setup.cfg +0 -0
  40. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/__init__.py +0 -0
  41. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/api.py +0 -0
  42. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/cli.py +0 -0
  43. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/__init__.py +0 -0
  44. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/build.py +0 -0
  45. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/cli_helpers.py +0 -0
  46. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/exceptions.py +0 -0
  47. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/filesystem.py +0 -0
  48. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/inference_engines.py +0 -0
  49. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/network.py +0 -0
  50. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/printing.py +0 -0
  51. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/test_helpers.py +0 -0
  52. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/__init__.py +0 -0
  53. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/agt_power.py +0 -0
  54. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/hwinfo_power.py +0 -0
  55. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/memory_tracker.py +0 -0
  56. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/profiler.py +0 -0
  57. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/sequence.py +0 -0
  58. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/state.py +0 -0
  59. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/__init__.py +0 -0
  60. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/accuracy.py +0 -0
  61. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/adapter.py +0 -0
  62. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/flm/__init__.py +0 -0
  63. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/huggingface/bench.py +0 -0
  64. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/huggingface/load.py +0 -0
  65. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/huggingface/utils.py +0 -0
  66. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/humaneval.py +0 -0
  67. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/management_tools.py +0 -0
  68. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/mmlu.py +0 -0
  69. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/__init__.py +0 -0
  70. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/utils.py +0 -0
  71. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/perplexity.py +0 -0
  72. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/prompt.py +0 -0
  73. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/report/__init__.py +0 -0
  74. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/report/llm_report.py +0 -0
  75. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/__init__.py +0 -0
  76. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/favicon.ico +0 -0
  77. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/chat.js +0 -0
  78. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/model-settings.js +0 -0
  79. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/tool_calls.py +0 -0
  80. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/tray.py +0 -0
  81. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/macos_tray.py +0 -0
  82. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/port.py +0 -0
  83. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/thread.py +0 -0
  84. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/windows_tray.py +0 -0
  85. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/webapp.py +0 -0
  86. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/wrapped_server.py +0 -0
  87. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/tool.py +0 -0
  88. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_install/__init__.py +0 -0
  89. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  90. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  91. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
  92. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/pydantic_models.py +0 -0
  93. {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/settings.py +0 -0
PKG-INFO (+5 -3)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lemonade-sdk
- Version: 8.1.11
+ Version: 8.2.0
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
  Author-email: lemonade@amd.com
  Requires-Python: >=3.10, <3.14
@@ -29,12 +29,13 @@ Requires-Dist: tabulate
  Requires-Dist: sentencepiece
  Requires-Dist: huggingface-hub[hf_xet]==0.33.0
  Requires-Dist: python-dotenv
+ Requires-Dist: python-multipart
  Requires-Dist: rumps>=0.4.0; sys_platform == "darwin"
  Provides-Extra: oga-ryzenai
- Requires-Dist: onnxruntime-genai-directml-ryzenai==0.7.0.2.1; extra == "oga-ryzenai"
+ Requires-Dist: onnxruntime-genai-directml-ryzenai==0.9.2; extra == "oga-ryzenai"
  Requires-Dist: protobuf>=6.30.1; extra == "oga-ryzenai"
  Provides-Extra: oga-cpu
- Requires-Dist: onnxruntime-genai==0.8.2; extra == "oga-cpu"
+ Requires-Dist: onnxruntime-genai==0.9.2; extra == "oga-cpu"
  Requires-Dist: onnxruntime>=1.22.0; extra == "oga-cpu"
  Provides-Extra: dev
  Requires-Dist: torch>=2.6.0; extra == "dev"
@@ -264,6 +265,7 @@ This project is:
  - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
  - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
  - [OpenAI API](https://github.com/openai/openai-python)
+ - [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
  - and more...
  - Accelerated by mentorship from the OCV Catalyst program.
  - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).
README.md (+1 -0)
@@ -207,6 +207,7 @@ This project is:
  - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
  - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
  - [OpenAI API](https://github.com/openai/openai-python)
+ - [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
  - and more...
  - Accelerated by mentorship from the OCV Catalyst program.
  - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).
setup.py (+3 -2)
@@ -49,6 +49,7 @@ setup(
  "sentencepiece",
  "huggingface-hub[hf_xet]==0.33.0",
  "python-dotenv",
+ "python-multipart",
  # macOS-specific dependencies
  "rumps>=0.4.0; sys_platform == 'darwin'",
  ],
@@ -57,11 +58,11 @@ setup(
  # applications, without including developer-focused tools
  # Primary NPU extra using unified PyPI package
  "oga-ryzenai": [
- "onnxruntime-genai-directml-ryzenai==0.7.0.2.1",
+ "onnxruntime-genai-directml-ryzenai==0.9.2",
  "protobuf>=6.30.1",
  ],
  "oga-cpu": [
- "onnxruntime-genai==0.8.2",
+ "onnxruntime-genai==0.9.2",
  "onnxruntime >=1.22.0",
  ],
  # Developer-focused tools for benchmarking, accuracy testing, and
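
Note: both extras now pin their onnxruntime-genai packages to 0.9.2. A minimal sketch (not part of the package) of verifying the new pin after installing the oga-cpu extra, using only the standard library:

    # Hypothetical post-install check, assuming `pip install "lemonade-sdk[oga-cpu]"` succeeded
    from importlib.metadata import version

    assert version("onnxruntime-genai") == "0.9.2"  # pin introduced in 8.2.0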

src/lemonade/cache.py (+6 -1)
@@ -43,7 +43,11 @@ def build_name(input_name):
  """
 
  if os.path.isdir(input_name):
+ # Input is a folder so no good way to determine a model name
  input_name_sanitized = "local_model"
+ elif os.path.isfile(input_name):
+ # Use the filename without its extension
+ input_name_sanitized = os.path.splitext(os.path.basename(input_name))[0]
  else:
  # Sanitize the input name
  input_name_sanitized = input_name.replace("/", "_")
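
Note: a minimal sketch (not from the package) of the name-resolution behavior introduced above, using only the standard library:

    import os

    def build_name_sketch(input_name):
        # Mirrors the branch logic shown in the hunk above
        if os.path.isdir(input_name):
            return "local_model"  # folder: no good way to determine a model name
        if os.path.isfile(input_name):
            # New in 8.2.0: file inputs use the filename without its extension
            return os.path.splitext(os.path.basename(input_name))[0]
        return input_name.replace("/", "_")  # checkpoint name: sanitize the slashes

    # e.g. an existing "models/qwen.gguf" file -> "qwen"
    #      an "org/model-name" checkpoint      -> "org_model-name"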
@@ -63,8 +67,9 @@ class Keys:
  TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
  STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
  SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
- PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
  STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
+ PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
+ STD_DEV_PREFILL_TOKENS_PER_SECOND = "std_dev_prefill_tokens_per_second"
  CHECKPOINT = "checkpoint"
  DTYPE = "dtype"
  PROMPT = "prompt"

src/lemonade/common/status.py (+4 -4)
@@ -112,10 +112,10 @@ class UniqueInvocationInfo(BasicInfo):
  if print_file_name:
  print(f"{self.script_name}{self.extension}:")
 
- # Print invocation about the model (only applies to scripts, not ONNX files or
+ # Print invocation about the model (only applies to scripts, not ONNX or GGUF files, nor
  # LLMs, which have no extension)
  if not (
- self.extension == ".onnx"
+ self.extension in [".onnx", ".gguf"]
  or self.extension == build.state_file_name
  or self.extension == ""
  ):
@@ -138,7 +138,7 @@ class UniqueInvocationInfo(BasicInfo):
 
  if self.depth == 0:
  print(f"{self.indent}\tLocation:\t{self.file}", end="")
- if self.extension == ".onnx":
+ if self.extension in [".onnx", ".gguf"]:
  print()
  else:
  print(f", line {self.line}")
@@ -314,7 +314,7 @@ class UniqueInvocationInfo(BasicInfo):
  Print information about a given model or submodel.
  """
 
- if self.extension == ".onnx" or self.extension == "":
+ if self.extension in [".onnx", ".gguf"] or self.extension == "":
  self.indent = "\t" * (2 * self.depth)
  else:
  self.indent = "\t" * (2 * self.depth + 1)

src/lemonade/common/system_info.py (+0 -26)
@@ -1110,32 +1110,6 @@ class LinuxSystemInfo(SystemInfo):
 
  return ""
 
- def _get_nvidia_vram_smi_linux(self) -> float:
- """
- Get NVIDIA GPU VRAM on Linux using nvidia-smi command.
-
- Returns:
- float: VRAM in GB, or 0.0 if detection fails
- """
- try:
- output = (
- subprocess.check_output(
- "nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits",
- shell=True,
- stderr=subprocess.DEVNULL,
- )
- .decode()
- .strip()
- )
-
- # nvidia-smi returns memory in MB
- vram_mb = int(output.split("\n")[0])
- vram_gb = round(vram_mb / 1024, 1)
- return vram_gb
- except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
- pass
- return 0.0
-
  @staticmethod
  def get_processor_name() -> str:
  """

src/lemonade/tools/bench.py (+22 -1)
@@ -29,7 +29,9 @@ class Bench(Tool, ABC):
  Keys.SECONDS_TO_FIRST_TOKEN,
  Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
  Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+ Keys.STD_DEV_TOKENS_PER_SECOND,
  Keys.PREFILL_TOKENS_PER_SECOND,
+ Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
  Keys.PROMPT_TOKENS,
  Keys.RESPONSE_TOKENS,
  Keys.MAX_MEMORY_USED_GBYTE,
@@ -42,7 +44,9 @@ class Bench(Tool, ABC):
  self.mean_time_to_first_token_list = []
  self.std_dev_time_to_first_token_list = []
  self.prefill_tokens_per_second_list = []
+ self.std_dev_prefill_tokens_per_second_list = []
  self.token_generation_tokens_per_second_list = []
+ self.std_dev_token_generation_tokens_per_second_list = []
  self.max_memory_used_gb_list = []
 
  # Max memory used can only be measured on Windows systems
@@ -88,7 +92,7 @@ class Bench(Tool, ABC):
  default=[str(default_prompt_length)],
  metavar="PROMPT",
  help="Input one or more prompts to the LLM. Three formats are supported. "
- "1) integer: use a synthetic prompt with the specified length "
+ "1) integer: use a synthetic prompt with the specified token length "
  "2) str: use a user-provided prompt string "
  "3) path/to/prompt.txt: load the prompt from a text file. "
  f"(default: {default_prompt_length}) ",
@@ -246,10 +250,27 @@ class Bench(Tool, ABC):
  Keys.PREFILL_TOKENS_PER_SECOND,
  self.get_item_or_list(self.prefill_tokens_per_second_list),
  )
+ if not all(
+ element is None for element in self.std_dev_prefill_tokens_per_second_list
+ ):
+ state.save_stat(
+ Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+ self.get_item_or_list(self.std_dev_prefill_tokens_per_second_list),
+ )
  state.save_stat(
  Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
  self.get_item_or_list(self.token_generation_tokens_per_second_list),
  )
+ if not all(
+ element is None
+ for element in self.std_dev_token_generation_tokens_per_second_list
+ ):
+ state.save_stat(
+ Keys.STD_DEV_TOKENS_PER_SECOND,
+ self.get_item_or_list(
+ self.std_dev_token_generation_tokens_per_second_list
+ ),
+ )
  if self.save_max_memory_used:
  state.save_stat(
  Keys.MAX_MEMORY_USED_GBYTE,
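
Note: the "all(element is None ...)" guards above exist because a standard deviation is undefined for a single sample; a short illustration (not from the package) of why the per-iteration lists may hold None:

    import statistics

    samples = [42.0]  # a single-iteration benchmark run
    try:
        std_dev = statistics.stdev(samples)
    except statistics.StatisticsError:
        # Fewer than two measurements: store None, so the stat is skipped
        # when every element in the list is None
        std_dev = None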

src/lemonade/tools/flm/utils.py (+70 -22)
@@ -10,16 +10,46 @@ import time
  from typing import List, Optional
 
  import requests
- from packaging.version import Version
+ from packaging.version import Version, InvalidVersion
 
 
- FLM_MINIMUM_VERSION = "0.9.10"
+ def get_flm_latest_version() -> Optional[str]:
+ """
+ Get and return the latest FLM version from "https://github.com/FastFlowLM/FastFlowLM/tags"
+ This uses the GitHub tags API.
+ """
+ url = "https://api.github.com/repos/FastFlowLM/FastFlowLM/tags"
+ try:
+ response = requests.get(url, timeout=10)
+ response.raise_for_status()
+ tags = response.json()
+ if not tags:
+ return None
+ # Tags are sorted in reverse chronological order; find the first that looks like a version
+ for tag in tags:
+ tag_name = tag.get("name", "")
+ # Accept tags of the form v0.9.10, 0.9.10, etc.
+ if tag_name.startswith("v"):
+ version_candidate = tag_name[1:]
+ else:
+ version_candidate = tag_name
+ try:
+ # validate it's a version string
+ _ = Version(version_candidate)
+ return version_candidate
+ except InvalidVersion:
+ continue
+ return None
+ except requests.exceptions.RequestException as e:
+ logging.debug("Error retrieving latest FLM version: %s", e)
+ return None
 
 
  def check_flm_version() -> Optional[str]:
  """
  Check if FLM is installed and return version, or None if not available.
  """
+ latest_version_str = get_flm_latest_version()
  try:
  result = subprocess.run(
  ["flm", "version"],
@@ -34,11 +64,11 @@ def check_flm_version() -> Optional[str]:
  output = result.stdout.strip()
  if output.startswith("FLM v"):
  version_str = output[5:] # Remove "FLM v" prefix
- return version_str
- return None
+ return version_str, latest_version_str
+ return None, latest_version_str
 
  except (subprocess.CalledProcessError, FileNotFoundError):
- return None
+ return None, latest_version_str
 
 
  def refresh_environment():
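
Note: check_flm_version() now returns an (installed, latest) pair instead of a single string; a hedged usage sketch (not part of the package), assuming the module above is importable:

    from packaging.version import Version
    from lemonade.tools.flm.utils import check_flm_version

    installed, latest = check_flm_version()
    if installed is None:
        print("FLM is not installed")
    elif latest is None:
        print(f"FLM v{installed} installed; latest release could not be determined")
    elif Version(installed) == Version(latest):
        print(f"FLM v{installed} is up to date")
    else:
        print(f"FLM v{installed} installed; v{latest} is available")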
@@ -76,31 +106,42 @@ def install_flm():
  If not, download and run the GUI installer, then wait for completion.
  """
  # Check current FLM installation
- current_version = check_flm_version()
+ current_version, latest_version = check_flm_version()
 
- if current_version and Version(current_version) >= Version(FLM_MINIMUM_VERSION):
+ if (
+ current_version
+ and latest_version
+ and Version(current_version) == Version(latest_version)
+ ):
  logging.info(
- "FLM v%s is already installed and meets minimum version requirement (v%s)",
+ "FLM v%s is already installed and is up to date (latest version: v%s).",
  current_version,
- FLM_MINIMUM_VERSION,
+ latest_version,
  )
  return
 
  if current_version:
+ if not latest_version:
+ logging.info(
+ "Unable to detect the latest FLM version; continuing with installed FLM v%s.",
+ current_version,
+ )
+ return
  logging.info(
- "FLM v%s is installed but below minimum version v%s. Upgrading...",
+ "FLM v%s is installed but below latest version v%s. Upgrading...",
  current_version,
- FLM_MINIMUM_VERSION,
+ latest_version,
  )
+ verysilent = True
  else:
- logging.info(
- "FLM not found. Installing FLM v%s or later...", FLM_MINIMUM_VERSION
- )
+ logging.info("FLM not found. Installing FLM v%s or later...", latest_version)
+ verysilent = False
 
  # Download the installer
  # pylint: disable=line-too-long
  installer_url = "https://github.com/FastFlowLM/FastFlowLM/releases/latest/download/flm-setup.exe"
  installer_path = os.path.join(tempfile.gettempdir(), "flm-setup.exe")
+ installer_args = [installer_path, "/VERYSILENT"] if verysilent else [installer_path]
 
  try:
  # Remove existing installer if present
@@ -123,13 +164,15 @@ def install_flm():
  # Launch the installer GUI
  logging.warning(
  "Launching FLM installer GUI. Please complete the installation..."
+ if not verysilent
+ else "Installing FLM..."
  )
 
  # Launch installer and wait for it to complete
  if os.name == "nt": # Windows
- process = subprocess.Popen([installer_path], shell=True)
+ process = subprocess.Popen(installer_args, shell=True)
  else:
- process = subprocess.Popen([installer_path])
+ process = subprocess.Popen(installer_args)
 
  # Wait for installer to complete
  process.wait()
@@ -150,8 +193,8 @@ def install_flm():
  # Verify installation
  max_retries = 10
  for attempt in range(max_retries):
- new_version = check_flm_version()
- if new_version and Version(new_version) >= Version(FLM_MINIMUM_VERSION):
+ new_version, latest_version = check_flm_version()
+ if new_version and Version(new_version) == Version(latest_version):
  logging.info("FLM v%s successfully installed and verified", new_version)
  return
 
@@ -240,7 +283,12 @@ def get_flm_installed_models() -> List[str]:
 
  return installed_checkpoints
 
- except (subprocess.CalledProcessError, FileNotFoundError, AttributeError):
+ except (
+ subprocess.CalledProcessError,
+ FileNotFoundError,
+ AttributeError,
+ NotADirectoryError,
+ ):
  # FLM not installed, not available, or output parsing failed
  return []
 
@@ -249,7 +297,7 @@ def is_flm_available() -> bool:
  """
  Check if FLM is available and meets minimum version requirements.
  """
- current_version = check_flm_version()
- return current_version is not None and Version(current_version) >= Version(
- FLM_MINIMUM_VERSION
+ current_version, latest_version = check_flm_version()
+ return current_version is not None and Version(current_version) == Version(
+ latest_version
  )

src/lemonade/tools/llamacpp/bench.py (new file, +224)
@@ -0,0 +1,224 @@
+ import argparse
+ import statistics
+ from statistics import StatisticsError
+ from lemonade.state import State
+ from lemonade.tools.tool import Tool
+ from lemonade.tools.llamacpp.utils import LlamaCppAdapter
+ from lemonade.tools.bench import (
+ Bench,
+ default_prompt_length,
+ default_iterations,
+ default_output_tokens,
+ default_warmup_runs,
+ )
+
+
+ class LlamaCppBench(Bench):
+ """
+ Benchmark a llama.cpp model
+ """
+
+ unique_name = "llamacpp-bench"
+
+ def __init__(self, monitor_message="Benchmarking LLM"):
+ super().__init__(monitor_message)
+
+ # Don't track memory usage since we are using a llamacpp executable for compute
+ self.save_max_memory_used = False
+
+ @staticmethod
+ def parser(add_help: bool = True) -> argparse.ArgumentParser:
+ parser = __class__.helpful_parser(
+ short_description="Benchmark an LLM in llama.cpp",
+ add_help=add_help,
+ )
+
+ parser = Bench.parser(parser)
+
+ parser.add_argument(
+ "--cli",
+ action="store_true",
+ help="Set this flag to use llama-cli.exe to benchmark model performance. This executable will be called "
+ "once per iteration. Otherwise, llama-bench.exe is used by default. In this default behavior behavior, "
+ "the only valid prompt format is integer token lengths. Also, the warmup-iterations parameter is "
+ "ignored and the default value for number of threads is 16.",
+ )
+
+ return parser
+
+ def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+ """
+ Helper function to parse CLI arguments into the args expected by run()
+ """
+
+ # Call Tool parse method, NOT the Bench parse method
+ parsed_args = Tool.parse(self, state, args, known_only)
+
+ if parsed_args.cli:
+ parsed_args = super().parse(state, args, known_only)
+ else:
+ # Make sure prompts is a list of integers
+ if parsed_args.prompts is None:
+ parsed_args.prompts = [default_prompt_length]
+ prompt_ints = []
+ for prompt_item in parsed_args.prompts:
+ if prompt_item.isdigit():
+ prompt_ints.append(int(prompt_item))
+ else:
+ raise Exception(
+ f"When not using the --cli flag to {self.unique_name}, the prompt format must "
+ "be in integer format."
+ )
+ parsed_args.prompts = prompt_ints
+
+ return parsed_args
+
+ def run_prompt(
+ self,
+ state: State,
+ report_progress_fn,
+ prompt: str,
+ iterations: int,
+ warmup_iterations: int,
+ output_tokens: int,
+ ) -> State:
+ """
+ Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
+ """
+
+ if self.first_run_prompt:
+
+ if not hasattr(state, "model") or not isinstance(
+ state.model, LlamaCppAdapter
+ ):
+ raise Exception(
+ f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
+ "loaded first. Please run load-llama-cpp before this tool."
+ )
+ model: LlamaCppAdapter = state.model
+
+ per_iteration_tokens_per_second = []
+ per_iteration_time_to_first_token = []
+
+ for iteration in range(iterations + warmup_iterations):
+ try:
+ # Use the adapter's generate method which already has the timeout
+ # and error handling
+ model.time_to_first_token = None
+ model.tokens_per_second = None
+ raw_output, stderr = model.generate(
+ prompt, max_new_tokens=output_tokens, return_raw=True
+ )
+
+ if model.time_to_first_token is None or model.tokens_per_second is None:
+ error_msg = (
+ "Could not find timing information in llama.cpp output.\n"
+ )
+ error_msg += "Raw output:\n" + raw_output + "\n"
+ error_msg += "Stderr:\n" + stderr
+ raise Exception(error_msg)
+
+ self.tokens_out_len_list.append(model.response_tokens)
+
+ if iteration > warmup_iterations - 1:
+ per_iteration_tokens_per_second.append(model.tokens_per_second)
+ per_iteration_time_to_first_token.append(model.time_to_first_token)
+
+ report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
+
+ except Exception as e:
+ error_msg = f"Failed to run benchmark: {str(e)}"
+ raise Exception(error_msg)
+
+ self.input_ids_len_list.append(model.prompt_tokens)
+ mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
+ self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+ self.prefill_tokens_per_second_list.append(
+ model.prompt_tokens / mean_time_to_first_token
+ )
+ self.token_generation_tokens_per_second_list.append(
+ statistics.mean(per_iteration_tokens_per_second)
+ )
+ try:
+ self.std_dev_time_to_first_token_list.append(
+ statistics.stdev(per_iteration_time_to_first_token)
+ )
+ except StatisticsError:
+ # Less than 2 measurements
+ self.std_dev_time_to_first_token_list.append(None)
+ try:
+ self.std_dev_token_generation_tokens_per_second_list.append(
+ statistics.stdev(per_iteration_tokens_per_second)
+ )
+ except StatisticsError:
+ # Less than 2 measurements
+ self.std_dev_token_generation_tokens_per_second_list.append(None)
+
+ def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+ if prompts is None:
+ prompts = [default_prompt_length]
+ elif isinstance(prompts, int):
+ prompts = [prompts]
+
+ state.save_stat("prompts", prompts)
+ state.save_stat("iterations", iterations)
+ state.save_stat("output_tokens", output_tokens)
+
+ model: LlamaCppAdapter = state.model
+ prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd = model.benchmark(
+ prompts, iterations, output_tokens
+ )
+ self.input_ids_len_list = prompt_lengths
+ self.prefill_tokens_per_second_list = pp_tps
+ if iterations > 1:
+ self.std_dev_prefill_tokens_per_second_list = pp_tps_sd
+ self.mean_time_to_first_token_list = [
+ tokens / tps for tokens, tps in zip(prompt_lengths, pp_tps)
+ ]
+ self.token_generation_tokens_per_second_list = [tg_tps]
+ if iterations > 1:
+ self.std_dev_token_generation_tokens_per_second_list = [tg_tps_sd]
+ self.tokens_out_len_list = [output_tokens] * len(prompts) * iterations
+
+ self.save_stats(state)
+ return state
+
+ def run(
+ self,
+ state: State,
+ prompts: list[str] = None,
+ iterations: int = default_iterations,
+ warmup_iterations: int = default_warmup_runs,
+ output_tokens: int = default_output_tokens,
+ cli: bool = False,
+ **kwargs,
+ ) -> State:
+ """
+ Args:
+ - prompts: List of input prompts used as starting points for LLM text generation
+ - iterations: Number of benchmarking samples to take; results are
+ reported as the median and mean of the samples.
+ - warmup_iterations: Subset of the iterations to treat as warmup,
+ and not included in the results.
+ - output_tokens: Number of new tokens LLM to create.
+ - ggml: Use llama-bench.exe directly
+ - kwargs: Additional parameters used by bench tools
+ """
+
+ # Check that state has the attribute model and it is a LlamaCPP model
+ if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+ raise Exception("Load model using llamacpp-load first.")
+
+ if cli:
+ state = super().run(
+ state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+ )
+ else:
+ state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+ return state
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
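
Note: in run_llama_bench_exe above, mean time to first token is derived from prompt length and prefill throughput; a tiny illustration (example values only, not from the package):

    prompt_lengths = [256, 1024]   # tokens in each synthetic prompt
    pp_tps = [3200.0, 3500.0]      # prefill tokens/second reported per prompt (made-up values)
    mean_ttft = [tokens / tps for tokens, tps in zip(prompt_lengths, pp_tps)]
    # -> [0.08, ~0.29] seconds to first token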

src/lemonade/tools/llamacpp/load.py (+30 -2)
@@ -93,9 +93,11 @@ class LoadLlamaCpp(FirstTool):
  from lemonade.tools.llamacpp.utils import (
  install_llamacpp,
  get_llama_cli_exe_path,
+ get_llama_bench_exe_path,
  get_llama_installed_version,
  parse_checkpoint,
  download_gguf,
+ resolve_local_gguf_model,
  get_local_checkpoint_path,
  LlamaCppTokenizerAdapter,
  LlamaCppAdapter,
@@ -103,6 +105,8 @@ class LoadLlamaCpp(FirstTool):
 
  install_llamacpp(backend)
 
+ extension = ""
+
  # Check if input is a local folder containing a .GGUF model
  if os.path.isdir(input):
  # input is a local folder
@@ -121,6 +125,17 @@ class LoadLlamaCpp(FirstTool):
  )
  model_to_use = gguf_files[0]
  full_model_path = os.path.join(local_model_folder, model_to_use)
+ extension = ".gguf"
+
+ elif input.endswith(".gguf") and os.path.isfile(input):
+ # input is a local .gguf file
+ full_model_path = os.path.abspath(input)
+ checkpoint = "local_model"
+ state.checkpoint = checkpoint
+ state.save_stat(Keys.CHECKPOINT, checkpoint)
+ state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+ model_to_use = os.path.basename(full_model_path)
+ extension = ".gguf"
 
  else:
  # Input is a model checkpoint
@@ -155,12 +170,21 @@ class LoadLlamaCpp(FirstTool):
  )
 
  else:
+ # First, try to resolve from local cache to avoid unnecessary downloads
+ base_checkpoint, variant = parse_checkpoint(checkpoint)
+ snapshot_files = resolve_local_gguf_model(
+ base_checkpoint, variant, None
+ )
+
+ # If not found locally, download from internet
+ if not snapshot_files:
+ snapshot_files = download_gguf(checkpoint)
 
- snapshot_files = download_gguf(checkpoint)
  full_model_path = snapshot_files["variant"]
  model_to_use = os.path.basename(full_model_path)
 
  llama_cli_exe_path = get_llama_cli_exe_path(backend)
+ llama_bench_exe_path = get_llama_bench_exe_path(backend)
  printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
 
  # Get the directory containing the executable for shared libraries
@@ -174,8 +198,10 @@ class LoadLlamaCpp(FirstTool):
  context_size=context_size,
  threads=threads,
  executable=llama_cli_exe_path,
+ bench_executable=llama_bench_exe_path,
  reasoning=reasoning,
  lib_dir=lib_dir,
+ state=state,
  )
  state.tokenizer = LlamaCppTokenizerAdapter()
  state.device = device
@@ -186,7 +212,9 @@ class LoadLlamaCpp(FirstTool):
  Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
  )
 
- status.add_to_state(state=state, name=input, model=model_to_use)
+ status.add_to_state(
+ state=state, name=input, model=model_to_use, extension=extension
+ )
  return state
 