lemonade-sdk 8.0.4-py3-none-any.whl → 8.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lemonade/api.py +50 -0
- lemonade/cache.py +3 -1
- lemonade/common/inference_engines.py +415 -0
- lemonade/common/system_info.py +493 -47
- lemonade/tools/adapter.py +6 -0
- lemonade/tools/huggingface/utils.py +6 -5
- lemonade/tools/llamacpp/bench.py +26 -46
- lemonade/tools/llamacpp/load.py +104 -196
- lemonade/tools/llamacpp/utils.py +612 -0
- lemonade/tools/management_tools.py +53 -7
- lemonade/tools/oga/bench.py +5 -6
- lemonade/tools/oga/utils.py +8 -2
- lemonade/tools/prompt.py +17 -25
- lemonade/tools/report/table.py +12 -9
- lemonade/tools/server/llamacpp.py +80 -92
- lemonade/tools/server/serve.py +32 -0
- lemonade/tools/server/static/styles.css +137 -58
- lemonade/tools/server/static/webapp.html +34 -8
- lemonade/tools/server/tray.py +7 -0
- lemonade/version.py +1 -1
- lemonade_sdk-8.0.6.dist-info/METADATA +295 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/RECORD +30 -28
- lemonade_server/cli.py +168 -22
- lemonade_server/model_manager.py +4 -148
- lemonade_server/server_models.json +11 -0
- lemonade_sdk-8.0.4.dist-info/METADATA +0 -176
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/top_level.txt +0 -0
lemonade/tools/adapter.py
CHANGED
@@ -13,6 +13,9 @@ class ModelAdapter(abc.ABC):
         """
         self.tokens_per_second = None
         self.time_to_first_token = None
+        self.prompt_tokens = None
+        self.response_tokens = None
+
         self.type = "generic"
 
     @abc.abstractmethod
@@ -22,6 +25,9 @@ class ModelAdapter(abc.ABC):
 
         We try to keep the signature here minimal to allow for maximum compatibility
         with recipe components, which themselves may not support a lot of arguments.
+
+        The generate method should store prompt and response lengths (in tokens)
+        in the prompt_tokens and response_tokens members.
         """
 
 
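The docstring added above establishes a small contract for adapters: generate() should record the prompt and response lengths, in tokens, on the new prompt_tokens and response_tokens members. Below is a minimal sketch of an adapter honoring that contract; EchoAdapter and its whitespace "tokenizer" are made up for illustration and are not part of the lemonade SDK.

# Minimal sketch of the prompt_tokens / response_tokens contract described above.
# EchoAdapter and its whitespace "tokenizer" are illustrative stand-ins.
import abc


class ModelAdapter(abc.ABC):
    def __init__(self):
        self.tokens_per_second = None
        self.time_to_first_token = None
        self.prompt_tokens = None
        self.response_tokens = None

    @abc.abstractmethod
    def generate(self, input_ids, **kwargs):
        ...


class EchoAdapter(ModelAdapter):
    def generate(self, input_ids, **kwargs):
        # Treat each whitespace-separated word as one "token"
        prompt_tokens = input_ids.split()
        response = ["hello", "world"]

        # The contract: store prompt and response lengths (in tokens)
        self.prompt_tokens = len(prompt_tokens)
        self.response_tokens = len(response)
        return [" ".join(prompt_tokens + response)]


adapter = EchoAdapter()
adapter.generate("What is the capital of France?")
print(adapter.prompt_tokens, adapter.response_tokens)  # 6 2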
lemonade/tools/huggingface/utils.py
CHANGED
@@ -108,7 +108,9 @@ class HuggingfaceAdapter(ModelAdapter):
         with torch.no_grad(), torch.inference_mode():
             outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)
 
-
+        self.prompt_tokens = input_ids.shape[1]
+        self.response_tokens = len(outputs[0]) - self.prompt_tokens
+        return outputs
 
     def _model_call(self, input_tensor):
         """Forward pass through the model to get logits
@@ -341,12 +343,11 @@ def benchmark_huggingface_llm(
 
         latency = end_time - start_time
 
-
-        tokens_out_len_list.append(token_len)
+        tokens_out_len_list.append(model.response_tokens)
 
         # Only count an iteration if it produced enough tokens
-        if …
-            per_iteration_result.append((latency, …
+        if model.response_tokens >= target_output_tokens:
+            per_iteration_result.append((latency, model.response_tokens))
 
         report_progress_fn(
             (warmup_iterations + count + 1) / (warmup_iterations + iterations)
lemonade/tools/llamacpp/bench.py
CHANGED
@@ -3,27 +3,31 @@ import statistics
 from statistics import StatisticsError
 from lemonade.state import State
 from lemonade.cache import Keys
-from lemonade.tools.llamacpp.…
+from lemonade.tools.llamacpp.utils import LlamaCppAdapter
 from lemonade.tools.bench import Bench
 
 
 class LlamaCppBench(Bench):
+    """
+    Benchmark a llama.cpp model
+    """
 
-    unique_name = "…
+    unique_name = "llamacpp-bench"
 
     def __init__(self):
         super().__init__()
 
         # Additional statistics generated by this bench tool
-        self.status_stats…
+        self.status_stats.insert(
+            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
             Keys.STD_DEV_TOKENS_PER_SECOND,
-
+        )
         self.std_dev_token_generation_tokens_per_second_list = []
 
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Benchmark…
+            short_description="Benchmark an LLM in llama.cpp",
             add_help=add_help,
         )
 
@@ -53,38 +57,20 @@ class LlamaCppBench(Bench):
                 f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
                 "loaded first. Please run load-llama-cpp before this tool."
             )
+        model: LlamaCppAdapter = state.model
 
-
-
+        per_iteration_tokens_per_second = []
+        per_iteration_time_to_first_token = []
 
         for iteration in range(iterations + warmup_iterations):
             try:
                 # Use the adapter's generate method which already has the timeout
                 # and error handling
-
-
-
-
-
-                input_tokens = None
-
-                # Look for timing in both stdout and stderr
-                for output in [raw_output, stderr]:
-                    for line in output.splitlines():
-                        if "llama_perf_context_print: eval time =" in line:
-                            parts = line.split("(")[1].strip()
-                            parts = parts.split(",")
-                            ms_per_token = float(
-                                parts[0].split("ms per token")[0].strip()
-                            )
-                        if "llama_perf_context_print: prompt eval time =" in line:
-                            parts = line.split("=")[1].split("/")
-                            time_to_first_token_ms = float(
-                                parts[0].split("ms")[0].strip()
-                            )
-                            input_tokens = int(parts[1].split("tokens")[0].strip())
-
-                if ms_per_token is None or time_to_first_token_ms is None:
+                model.time_to_first_token = None
+                model.tokens_per_second = None
+                raw_output, stderr = model.generate(prompt, return_raw=True)
+
+                if model.time_to_first_token is None or model.tokens_per_second is None:
                     error_msg = (
                         "Could not find timing information in llama.cpp output.\n"
                     )
@@ -92,17 +78,11 @@ class LlamaCppBench(Bench):
                     error_msg += "Stderr:\n" + stderr
                     raise Exception(error_msg)
 
-
-                # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
-                # as performance data for generating a few tokens is not relevant.
-                tokens_per_second = 0
-                if output_tokens > 5 and ms_per_token > 0:
-                    tokens_per_second = 1000 / ms_per_token
-                time_to_first_token = time_to_first_token_ms / 1000
+                self.tokens_out_len_list.append(model.response_tokens)
 
                 if iteration > warmup_iterations - 1:
-
-
+                    per_iteration_tokens_per_second.append(model.tokens_per_second)
+                    per_iteration_time_to_first_token.append(model.time_to_first_token)
 
                 report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
 
@@ -110,25 +90,25 @@
             error_msg = f"Failed to run benchmark: {str(e)}"
             raise Exception(error_msg)
 
-        self.input_ids_len_list.append(…
-        mean_time_to_first_token = statistics.mean(…
+        self.input_ids_len_list.append(model.prompt_tokens)
+        mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
         self.prefill_tokens_per_second_list.append(
-
+            model.prompt_tokens / mean_time_to_first_token
         )
         self.token_generation_tokens_per_second_list.append(
-            statistics.mean(…
+            statistics.mean(per_iteration_tokens_per_second)
         )
         try:
             self.std_dev_time_to_first_token_list.append(
-                statistics.stdev(…
+                statistics.stdev(per_iteration_time_to_first_token)
             )
         except StatisticsError:
             # Less than 2 measurements
             self.std_dev_time_to_first_token_list.append(None)
         try:
             self.std_dev_token_generation_tokens_per_second_list.append(
-                statistics.stdev(…
+                statistics.stdev(per_iteration_tokens_per_second)
             )
         except StatisticsError:
             # Less than 2 measurements
lemonade/tools/llamacpp/load.py
CHANGED
@@ -1,166 +1,22 @@
 import argparse
 import os
-
-import subprocess
-from lemonade.state import State
+import lemonade.common.printing as printing
 import lemonade.common.status as status
+from lemonade.state import State
 from lemonade.tools import FirstTool
-from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
 from lemonade.cache import Keys
 
 
-class LlamaCppAdapter(ModelAdapter):
-    def __init__(
-        self, model, output_tokens, context_size, threads, executable, lib_dir=None
-    ):
-        super().__init__()
-
-        self.model = os.path.normpath(model)
-        self.output_tokens = output_tokens
-        self.context_size = context_size
-        self.threads = threads
-        self.executable = os.path.normpath(executable)
-        self.lib_dir = lib_dir
-
-    def generate(
-        self,
-        input_ids: str,
-        max_new_tokens: Optional[int] = None,
-        temperature: float = 0.8,
-        top_p: float = 0.95,
-        top_k: int = 40,
-        return_raw: bool = False,
-        **kwargs,  # pylint: disable=unused-argument
-    ):
-        """
-        Pass a text prompt into the llamacpp inference CLI.
-
-        The input_ids arg here should receive the original text that
-        would normally be encoded by a tokenizer.
-
-        Args:
-            input_ids: The input text prompt
-            max_new_tokens: Maximum number of tokens to generate
-            temperature: Temperature for sampling (0.0 = greedy)
-            top_p: Top-p sampling threshold
-            top_k: Top-k sampling threshold
-            return_raw: If True, returns the complete raw output including timing info
-            **kwargs: Additional arguments (ignored)
-
-        Returns:
-            List containing a single string with the generated text, or raw output if
-            return_raw=True
-        """
-
-        prompt = input_ids
-        n_predict = max_new_tokens if max_new_tokens is not None else self.output_tokens
-
-        cmd = [
-            self.executable,
-            "-m",
-            self.model,
-            "--ctx-size",
-            str(self.context_size),
-            "-n",
-            str(n_predict),
-            "-t",
-            str(self.threads),
-            "-p",
-            prompt,
-            "--temp",
-            str(temperature),
-            "--top-p",
-            str(top_p),
-            "--top-k",
-            str(top_k),
-            "-e",
-            "-no-cnv",
-        ]
-
-        cmd = [str(m) for m in cmd]
-
-        try:
-            # Set up environment with library path for Linux
-            env = os.environ.copy()
-            if self.lib_dir and os.name != "nt":  # Not Windows
-                current_ld_path = env.get("LD_LIBRARY_PATH", "")
-                if current_ld_path:
-                    env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
-                else:
-                    env["LD_LIBRARY_PATH"] = self.lib_dir
-
-            process = subprocess.Popen(
-                cmd,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                universal_newlines=True,
-                encoding="utf-8",
-                errors="replace",
-                env=env,
-            )
-
-            raw_output, stderr = process.communicate(timeout=600)
-            if process.returncode != 0:
-                error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
-                error_msg += f"Command: {' '.join(cmd)}\n"
-                error_msg += f"Error output:\n{stderr}\n"
-                error_msg += f"Standard output:\n{raw_output}"
-                raise Exception(error_msg)
-
-            if raw_output is None:
-                raise Exception("No output received from llama.cpp process")
-
-            # Parse timing information
-            for line in raw_output.splitlines():
-                if "llama_perf_context_print: eval time =" in line:
-                    parts = line.split("(")[1].strip()
-                    parts = parts.split(",")
-                    ms_per_token = float(parts[0].split("ms per token")[0].strip())
-                    self.tokens_per_second = (
-                        1000 / ms_per_token if ms_per_token > 0 else 0
-                    )
-                if "llama_perf_context_print: prompt eval time =" in line:
-                    parts = line.split("=")[1].split("/")[0]
-                    time_to_first_token_ms = float(parts.split("ms")[0].strip())
-                    self.time_to_first_token = time_to_first_token_ms / 1000
-
-            if return_raw:
-                return [raw_output, stderr]
-
-            # Find where the prompt ends and the generated text begins
-            prompt_found = False
-            output_text = ""
-            prompt_first_line = prompt.split("\n")[0]
-            for line in raw_output.splitlines():
-                if prompt_first_line in line:
-                    prompt_found = True
-                if prompt_found:
-                    line = line.replace("</s> [end of text]", "")
-                    output_text = output_text + line
-
-            if not prompt_found:
-                raise Exception(
-                    f"Could not find prompt '{prompt_first_line}' in llama.cpp output. "
-                    "This usually means the model failed to process the prompt correctly.\n"
-                    f"Raw output:\n{raw_output}\n"
-                    f"Stderr:\n{stderr}"
-                )
-
-            # Return list containing the generated text
-            return [output_text]
-
-        except Exception as e:
-            error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
-            error_msg += f"Command: {' '.join(cmd)}"
-            raise Exception(error_msg)
-
-
 class LoadLlamaCpp(FirstTool):
-    unique_name = "load…
+    unique_name = "llamacpp-load"
 
     def __init__(self):
         super().__init__(monitor_message="Loading llama.cpp model")
 
+        self.status_stats = [
+            Keys.DEVICE,
+        ]
+
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
@@ -169,28 +25,29 @@ class LoadLlamaCpp(FirstTool):
         )
 
         parser.add_argument(
-            "…
-
-
-
+            "-d",
+            "--device",
+            choices=["cpu", "igpu"],
+            default="igpu",
+            help="Which device to load the model on to (default: igpu)",
         )
 
-        default_threads = 1
+        default_threads = -1
         parser.add_argument(
             "--threads",
             required=False,
             type=int,
            default=default_threads,
-            help=f"Number of threads to use…
+            help=f"Number of threads to use during generation (default: {default_threads})",
         )
 
-        context_size =…
+        context_size = 4096
        parser.add_argument(
             "--context-size",
             required=False,
             type=int,
             default=context_size,
-            help=f"…
+            help=f"Size of the prompt context (default: {context_size}. 0 = loaded from model)",
         )
 
         output_tokens = 512
@@ -199,14 +56,13 @@ class LoadLlamaCpp(FirstTool):
             required=False,
             type=int,
             default=output_tokens,
-            help=f"Maximum number of output tokens…
+            help=f"Maximum number of output tokens to generate (default: {output_tokens})",
         )
 
         parser.add_argument(
-            "--…
-
-
-            help="Path to a .gguf model file",
+            "--reasoning",
+            action="store_true",
+            help="Set this flag to indicate the model is a reasoning model",
         )
 
         return parser
@@ -215,61 +71,113 @@ class LoadLlamaCpp(FirstTool):
         self,
         state: State,
         input: str = "",
+        device: str = "igpu",
         context_size: int = 512,
         threads: int = 1,
         output_tokens: int = 512,
-
-        executable: str = None,
-        lib_dir: Optional[str] = None,
+        reasoning: bool = False,
     ) -> State:
         """
         Load a llama.cpp model
         """
 
-        from lemonade.common.network import…
-
-
-
+        from lemonade.common.network import is_offline
+        from lemonade.tools.llamacpp.utils import (
+            install_llamacpp,
+            get_llama_cli_exe_path,
+            get_llama_installed_version,
+            parse_checkpoint,
+            download_gguf,
+            get_local_checkpoint_path,
+            LlamaCppTokenizerAdapter,
+            LlamaCppAdapter,
+        )
 
-        #…
-
+        # Validate and install llama.cpp, if needed
+        install_llamacpp()
+
+        # Check if input is a local folder containing a .GGUF model
+        if os.path.isdir(input):
+            # input is a local folder
+            local_model_folder = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, local_model_folder)
+
+            # See if there is a file ending in ".gguf" in this folder
+            dir = os.listdir(input)
+            gguf_files = [filename for filename in dir if filename.endswith(".gguf")]
+            if len(gguf_files) == 0:
+                raise ValueError(
+                    f"The folder {input} does not contain a GGUF model file."
+                )
+            model_to_use = gguf_files[0]
+            full_model_path = os.path.join(local_model_folder, model_to_use)
 
-        if model_binary:
-            model_to_use = os.path.normpath(model_binary)
         else:
-
-
+            # Input is a model checkpoint
+            checkpoint = input
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+
+            # Make sure that a variant is provided for the GGUF model
+            base_checkpoint, variant = parse_checkpoint(checkpoint)
+            if variant is None:
+                raise ValueError(
+                    "You are required to provide a 'variant' when "
+                    "selecting a GGUF model. The variant is provided "
+                    "as CHECKPOINT:VARIANT. For example: "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
+                )
+
+            # Auto-detect offline status
+            offline = is_offline()
+            if offline:
+                printing.log_warning(
+                    "Network connectivity to huggingface.co not detected. Running in offline mode."
+                )
+                full_model_path, model_to_use = get_local_checkpoint_path(
+                    base_checkpoint, variant
+                )
+                if not full_model_path:
+                    raise ValueError(
+                        f"Model {checkpoint} is not available locally."
+                        f"Cannot download in offline mode."
+                    )
+
+            else:
+
+                snapshot_files = download_gguf(checkpoint)
+                full_model_path = snapshot_files["variant"]
+                model_to_use = os.path.basename(full_model_path)
 
-
-
+        llama_cli_exe_path = get_llama_cli_exe_path()
+        printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
 
-
-
-                f"{self.__class__.unique_name} requires the preceding tool to pass a "
-                "Llamacpp model, "
-                "or for the user to supply a model with `--model-binary`"
-            )
+        # Get the directory containing the executable for shared libraries
+        lib_dir = os.path.dirname(llama_cli_exe_path)
 
+        # Pass the model and inputs into state
         state.model = LlamaCppAdapter(
-            model=…
+            model=full_model_path,
+            device=device,
             output_tokens=output_tokens,
             context_size=context_size,
             threads=threads,
-            executable=…
+            executable=llama_cli_exe_path,
+            reasoning=reasoning,
             lib_dir=lib_dir,
         )
-        state.tokenizer =…
+        state.tokenizer = LlamaCppTokenizerAdapter()
+        state.device = device
 
-        # Save stats
-        state.save_stat(Keys.…
-
-        # Get base model information if this is a converted HF model
-        base_model = get_base_model(input)
-        if base_model is not None:
-            state.save_stat("base_model", base_model)
+        # Save initial stats
+        state.save_stat(Keys.DEVICE, device)
+        state.save_stat(Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version())
 
         status.add_to_state(state=state, name=input, model=model_to_use)
-
         return state
 
 
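The error message above documents the new GGUF checkpoint convention: checkpoints are written as CHECKPOINT:VARIANT, where the variant is either a quantization label or an explicit .gguf filename. The helpers referenced here (install_llamacpp, parse_checkpoint, download_gguf, and friends) live in the new lemonade/tools/llamacpp/utils.py, which this view does not expand. Below is a hypothetical sketch of the splitting that convention implies; the real parse_checkpoint may behave differently.

# Hypothetical sketch of the CHECKPOINT:VARIANT convention described in the error
# message above; the real parse_checkpoint lives in lemonade/tools/llamacpp/utils.py
# (not shown in this diff) and may differ.
def parse_checkpoint_sketch(checkpoint: str):
    if ":" in checkpoint:
        base, variant = checkpoint.split(":", 1)
        return base, variant
    return checkpoint, None


print(parse_checkpoint_sketch("Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0"))
# ('Qwen/Qwen2.5-Coder-3B-Instruct-GGUF', 'Q4_0')
print(parse_checkpoint_sketch("Qwen/Qwen2.5-Coder-3B-Instruct-GGUF"))
# ('Qwen/Qwen2.5-Coder-3B-Instruct-GGUF', None)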