lemonade-sdk 8.1.11__py3-none-any.whl → 8.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic. Click here for more details.
- lemonade/cache.py +6 -1
- lemonade/common/status.py +4 -4
- lemonade/common/system_info.py +0 -26
- lemonade/tools/accuracy.py +143 -48
- lemonade/tools/adapter.py +6 -1
- lemonade/tools/bench.py +26 -8
- lemonade/tools/flm/utils.py +70 -22
- lemonade/tools/huggingface/bench.py +6 -1
- lemonade/tools/llamacpp/bench.py +146 -27
- lemonade/tools/llamacpp/load.py +30 -2
- lemonade/tools/llamacpp/utils.py +317 -21
- lemonade/tools/oga/bench.py +5 -26
- lemonade/tools/oga/load.py +49 -123
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/report/table.py +76 -8
- lemonade/tools/server/flm.py +2 -6
- lemonade/tools/server/llamacpp.py +43 -2
- lemonade/tools/server/serve.py +354 -18
- lemonade/tools/server/static/js/chat.js +15 -77
- lemonade/tools/server/static/js/model-settings.js +24 -3
- lemonade/tools/server/static/js/models.js +440 -37
- lemonade/tools/server/static/js/shared.js +61 -8
- lemonade/tools/server/static/logs.html +157 -13
- lemonade/tools/server/static/styles.css +204 -0
- lemonade/tools/server/static/webapp.html +39 -1
- lemonade/version.py +1 -1
- lemonade_install/install.py +33 -579
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +6 -4
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/RECORD +38 -37
- lemonade_server/cli.py +10 -0
- lemonade_server/model_manager.py +172 -11
- lemonade_server/pydantic_models.py +3 -0
- lemonade_server/server_models.json +102 -66
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/tools/report/table.py
CHANGED
|
@@ -48,6 +48,18 @@ def _merge_join(str1, str2) -> str:
|
|
|
48
48
|
return str1 + ("\n" if str1 and str2 else "") + str2
|
|
49
49
|
|
|
50
50
|
|
|
51
|
+
def _window_sum(data: list, n_windows: int) -> list:
|
|
52
|
+
"""Sums data into n_windows windows"""
|
|
53
|
+
if n_windows <= 0:
|
|
54
|
+
return data
|
|
55
|
+
window_size = max(1, len(data) // n_windows)
|
|
56
|
+
summed_data = []
|
|
57
|
+
for i in range(0, len(data), window_size):
|
|
58
|
+
window_sum = sum(data[i : i + window_size])
|
|
59
|
+
summed_data.append(window_sum)
|
|
60
|
+
return summed_data
|
|
61
|
+
|
|
62
|
+
|
|
51
63
|
################################################################################
|
|
52
64
|
# CLASSES THAT DESCRIBE TEXT TABLE COLUMNS
|
|
53
65
|
################################################################################
|
|
@@ -88,10 +100,54 @@ class SimpleStat(TableColumn):
|
|
|
88
100
|
if lean and self.omit_if_lean:
|
|
89
101
|
return None
|
|
90
102
|
data = build_stats.get(self.stat, None)
|
|
91
|
-
if data is None:
|
|
92
|
-
return ""
|
|
103
|
+
if data is None or (data == []):
|
|
104
|
+
return "-"
|
|
93
105
|
if self.stat_fn:
|
|
94
106
|
data = self.stat_fn(data)
|
|
107
|
+
cell_str = "\n".join(
|
|
108
|
+
[
|
|
109
|
+
_wrap("-" if x is None else f"{x:{self.format_str}}", self.wrap)
|
|
110
|
+
for x in _to_list(data)
|
|
111
|
+
]
|
|
112
|
+
)
|
|
113
|
+
return cell_str
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class DependentStat(TableColumn):
|
|
117
|
+
"""
|
|
118
|
+
These are for statistics already declared by the tool or basic build stats that
|
|
119
|
+
rely on one or more additional stats to compute their value. The dependency is
|
|
120
|
+
embodied by the stat_fn function.
|
|
121
|
+
"""
|
|
122
|
+
|
|
123
|
+
def __init__(
|
|
124
|
+
self,
|
|
125
|
+
column_header,
|
|
126
|
+
stats,
|
|
127
|
+
format_str,
|
|
128
|
+
align="center",
|
|
129
|
+
omit_if_lean=False,
|
|
130
|
+
wrap=None,
|
|
131
|
+
stat_fn=None,
|
|
132
|
+
):
|
|
133
|
+
self.column_header = column_header
|
|
134
|
+
self.stats = stats
|
|
135
|
+
self.format_str = format_str
|
|
136
|
+
self.align = align
|
|
137
|
+
self.omit_if_lean = omit_if_lean
|
|
138
|
+
self.wrap = wrap or self.default_wrap
|
|
139
|
+
self.stat_fn = stat_fn
|
|
140
|
+
|
|
141
|
+
def get_str(self, build_stats, lean=False):
|
|
142
|
+
if lean and self.omit_if_lean:
|
|
143
|
+
return None
|
|
144
|
+
stats_data = [build_stats.get(stat, None) for stat in self.stats]
|
|
145
|
+
if self.stat_fn:
|
|
146
|
+
data = self.stat_fn(stats_data)
|
|
147
|
+
else:
|
|
148
|
+
data = stats_data[0]
|
|
149
|
+
if data is None or (data == []):
|
|
150
|
+
return "-"
|
|
95
151
|
cell_str = "\n".join(
|
|
96
152
|
[_wrap(f"{x:{self.format_str}}", self.wrap) for x in _to_list(data)]
|
|
97
153
|
)
|
|
@@ -434,10 +490,12 @@ class Table(ABC):
|
|
|
434
490
|
row = []
|
|
435
491
|
|
|
436
492
|
# First columns
|
|
493
|
+
first_columns_count = 0
|
|
437
494
|
for entry in first_columns:
|
|
438
495
|
entry_str = entry.get_str(build_stats, self.lean)
|
|
439
496
|
if entry_str is not None:
|
|
440
497
|
row.append(entry_str)
|
|
498
|
+
first_columns_count += 1
|
|
441
499
|
|
|
442
500
|
# Per tool columns
|
|
443
501
|
for tool in tools:
|
|
@@ -460,22 +518,24 @@ class Table(ABC):
|
|
|
460
518
|
row.append(entry_str)
|
|
461
519
|
|
|
462
520
|
# Final columns
|
|
521
|
+
last_columns_count = 0
|
|
463
522
|
for entry in last_columns:
|
|
464
523
|
entry_str = entry.get_str(build_stats, self.lean)
|
|
465
524
|
if entry_str is not None:
|
|
466
525
|
row.append(entry_str)
|
|
526
|
+
last_columns_count += 1
|
|
467
527
|
|
|
468
528
|
# See if this row should be merged with the last row
|
|
469
529
|
if last_build_stats and self.merge_test_fn(last_build_stats, build_stats):
|
|
470
530
|
# Merge with last row
|
|
471
|
-
for col in range(0,
|
|
531
|
+
for col in range(0, first_columns_count):
|
|
472
532
|
# If identical, don't duplicate
|
|
473
533
|
if last_row[col] != row[col]:
|
|
474
534
|
last_row[col] = _merge_join(last_row[col], row[col])
|
|
475
|
-
for col in range(
|
|
535
|
+
for col in range(first_columns_count, len(row) - last_columns_count):
|
|
476
536
|
# Allow duplicates
|
|
477
537
|
last_row[col] = _merge_join(last_row[col], row[col])
|
|
478
|
-
for col in range(len(row) -
|
|
538
|
+
for col in range(len(row) - last_columns_count, len(row)):
|
|
479
539
|
# If identical, don't duplicate
|
|
480
540
|
if last_row[col] != row[col]:
|
|
481
541
|
last_row[col] = _merge_join(last_row[col], row[col])
|
|
@@ -581,17 +641,25 @@ class LemonadePerfTable(Table):
|
|
|
581
641
|
Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
|
|
582
642
|
".2f",
|
|
583
643
|
),
|
|
644
|
+
StatWithSD(
|
|
645
|
+
_wrap("Prefill Tokens per Second", 8),
|
|
646
|
+
Keys.PREFILL_TOKENS_PER_SECOND,
|
|
647
|
+
Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
|
|
648
|
+
".2f",
|
|
649
|
+
),
|
|
584
650
|
StatWithSD(
|
|
585
651
|
_wrap("Tokens per Second", 8),
|
|
586
652
|
Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
|
|
587
653
|
Keys.STD_DEV_TOKENS_PER_SECOND,
|
|
588
654
|
".2f",
|
|
589
655
|
),
|
|
590
|
-
|
|
656
|
+
DependentStat(
|
|
591
657
|
_wrap("Total Generated Tokens", 9),
|
|
592
|
-
Keys.RESPONSE_TOKENS,
|
|
658
|
+
[Keys.RESPONSE_TOKENS, Keys.PROMPT_TOKENS],
|
|
593
659
|
"d",
|
|
594
|
-
stat_fn=lambda x:
|
|
660
|
+
stat_fn=lambda x: _window_sum(
|
|
661
|
+
_to_list(x[0]), n_windows=len(_to_list(x[1]))
|
|
662
|
+
),
|
|
595
663
|
),
|
|
596
664
|
SimpleStat(
|
|
597
665
|
_wrap("Memory Used (GB)", 8), Keys.MAX_MEMORY_USED_GBYTE, ".3f"
|
lemonade/tools/server/flm.py
CHANGED
|
@@ -42,12 +42,6 @@ class FlmServer(WrappedServer):
|
|
|
42
42
|
self.flm_model_name = None
|
|
43
43
|
super().__init__(server_name="flm-server", telemetry=FlmTelemetry())
|
|
44
44
|
|
|
45
|
-
def _choose_port(self):
|
|
46
|
-
"""
|
|
47
|
-
`flm serve` doesn't support port selection as of v0.9.10
|
|
48
|
-
"""
|
|
49
|
-
self.port = 11434
|
|
50
|
-
|
|
51
45
|
def address(self):
|
|
52
46
|
return f"http://localhost:{self.port}/v1"
|
|
53
47
|
|
|
@@ -83,6 +77,8 @@ class FlmServer(WrappedServer):
|
|
|
83
77
|
f"{self.flm_model_name}",
|
|
84
78
|
"--ctx-len",
|
|
85
79
|
str(ctx_size),
|
|
80
|
+
"--port",
|
|
81
|
+
str(self.port),
|
|
86
82
|
]
|
|
87
83
|
|
|
88
84
|
# Set up environment with library path for Linux
|
|
@@ -15,9 +15,16 @@ from lemonade.tools.llamacpp.utils import (
|
|
|
15
15
|
get_llama_server_exe_path,
|
|
16
16
|
install_llamacpp,
|
|
17
17
|
download_gguf,
|
|
18
|
+
resolve_local_gguf_model,
|
|
19
|
+
parse_checkpoint,
|
|
18
20
|
)
|
|
19
21
|
from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
|
|
20
22
|
|
|
23
|
+
# Embedding model batch configuration set to 8192 as default
|
|
24
|
+
EMBEDDING_CTX_SIZE = 8192
|
|
25
|
+
EMBEDDING_BATCH_SIZE = 8192
|
|
26
|
+
EMBEDDING_UBATCH_SIZE = 8192
|
|
27
|
+
|
|
21
28
|
|
|
22
29
|
class LlamaTelemetry(WrappedServerTelemetry):
|
|
23
30
|
"""
|
|
@@ -101,8 +108,25 @@ class LlamaServer(WrappedServer):
|
|
|
101
108
|
self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
|
|
102
109
|
) -> dict:
|
|
103
110
|
"""
|
|
104
|
-
Download a model for the wrapper server
|
|
111
|
+
Download a model for the wrapper server.
|
|
112
|
+
First checks local cache, then downloads from internet if needed.
|
|
105
113
|
"""
|
|
114
|
+
# If it's a direct file path, just return it
|
|
115
|
+
|
|
116
|
+
if os.path.exists(config_checkpoint):
|
|
117
|
+
result = {"variant": config_checkpoint}
|
|
118
|
+
if config_mmproj:
|
|
119
|
+
result["mmproj"] = config_mmproj
|
|
120
|
+
return result
|
|
121
|
+
|
|
122
|
+
# Try to resolve from local cache first to avoid unnecessary downloads
|
|
123
|
+
checkpoint, variant = parse_checkpoint(config_checkpoint)
|
|
124
|
+
local_result = resolve_local_gguf_model(checkpoint, variant, config_mmproj)
|
|
125
|
+
|
|
126
|
+
if local_result:
|
|
127
|
+
return local_result
|
|
128
|
+
|
|
129
|
+
# Not found locally - download from internet
|
|
106
130
|
return download_gguf(
|
|
107
131
|
config_checkpoint=config_checkpoint,
|
|
108
132
|
config_mmproj=config_mmproj,
|
|
@@ -135,6 +159,12 @@ class LlamaServer(WrappedServer):
|
|
|
135
159
|
# Get the current executable path (handles both Windows and Ubuntu structures)
|
|
136
160
|
exe_path = get_llama_server_exe_path(self.backend)
|
|
137
161
|
|
|
162
|
+
# For embedding models, use a larger context size to support longer individual
|
|
163
|
+
# strings. Embedding requests can include multiple strings in a batch, and each
|
|
164
|
+
# string needs to fit within the context window.
|
|
165
|
+
if supports_embeddings and ctx_size < EMBEDDING_CTX_SIZE:
|
|
166
|
+
ctx_size = EMBEDDING_CTX_SIZE
|
|
167
|
+
|
|
138
168
|
# Build the base command
|
|
139
169
|
base_command = [
|
|
140
170
|
exe_path,
|
|
@@ -180,7 +210,18 @@ class LlamaServer(WrappedServer):
|
|
|
180
210
|
|
|
181
211
|
# Add embeddings support if the model supports it
|
|
182
212
|
if supports_embeddings:
|
|
183
|
-
|
|
213
|
+
# For embedding models, set batch sizes to handle multiple documents in a single request
|
|
214
|
+
# batch-size: logical batch size (total tokens across all sequences)
|
|
215
|
+
# ubatch-size: physical batch size (tokens processed in a single forward pass)
|
|
216
|
+
base_command.extend(
|
|
217
|
+
[
|
|
218
|
+
"--embeddings",
|
|
219
|
+
"--batch-size",
|
|
220
|
+
str(EMBEDDING_BATCH_SIZE),
|
|
221
|
+
"--ubatch-size",
|
|
222
|
+
str(EMBEDDING_UBATCH_SIZE),
|
|
223
|
+
]
|
|
224
|
+
)
|
|
184
225
|
|
|
185
226
|
# Add reranking support if the model supports it
|
|
186
227
|
if supports_reranking:
|