lemonade-sdk 8.1.11__py3-none-any.whl → 8.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic. Click here for more details.

Files changed (38)
  1. lemonade/cache.py +6 -1
  2. lemonade/common/status.py +4 -4
  3. lemonade/common/system_info.py +0 -26
  4. lemonade/tools/accuracy.py +143 -48
  5. lemonade/tools/adapter.py +6 -1
  6. lemonade/tools/bench.py +26 -8
  7. lemonade/tools/flm/utils.py +70 -22
  8. lemonade/tools/huggingface/bench.py +6 -1
  9. lemonade/tools/llamacpp/bench.py +146 -27
  10. lemonade/tools/llamacpp/load.py +30 -2
  11. lemonade/tools/llamacpp/utils.py +317 -21
  12. lemonade/tools/oga/bench.py +5 -26
  13. lemonade/tools/oga/load.py +49 -123
  14. lemonade/tools/oga/migration.py +403 -0
  15. lemonade/tools/report/table.py +76 -8
  16. lemonade/tools/server/flm.py +2 -6
  17. lemonade/tools/server/llamacpp.py +43 -2
  18. lemonade/tools/server/serve.py +354 -18
  19. lemonade/tools/server/static/js/chat.js +15 -77
  20. lemonade/tools/server/static/js/model-settings.js +24 -3
  21. lemonade/tools/server/static/js/models.js +440 -37
  22. lemonade/tools/server/static/js/shared.js +61 -8
  23. lemonade/tools/server/static/logs.html +157 -13
  24. lemonade/tools/server/static/styles.css +204 -0
  25. lemonade/tools/server/static/webapp.html +39 -1
  26. lemonade/version.py +1 -1
  27. lemonade_install/install.py +33 -579
  28. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +6 -4
  29. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/RECORD +38 -37
  30. lemonade_server/cli.py +10 -0
  31. lemonade_server/model_manager.py +172 -11
  32. lemonade_server/pydantic_models.py +3 -0
  33. lemonade_server/server_models.json +102 -66
  34. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  35. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  36. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  37. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  38. {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
@@ -48,6 +48,18 @@ def _merge_join(str1, str2) -> str:
48
48
  return str1 + ("\n" if str1 and str2 else "") + str2
49
49
 
50
50
 
51
+ def _window_sum(data: list, n_windows: int) -> list:
52
+ """Sums data into n_windows windows"""
53
+ if n_windows <= 0:
54
+ return data
55
+ window_size = max(1, len(data) // n_windows)
56
+ summed_data = []
57
+ for i in range(0, len(data), window_size):
58
+ window_sum = sum(data[i : i + window_size])
59
+ summed_data.append(window_sum)
60
+ return summed_data
61
+
62
+
51
63
  ################################################################################
52
64
  # CLASSES THAT DESCRIBE TEXT TABLE COLUMNS
53
65
  ################################################################################
@@ -88,10 +100,54 @@ class SimpleStat(TableColumn):
88
100
  if lean and self.omit_if_lean:
89
101
  return None
90
102
  data = build_stats.get(self.stat, None)
91
- if data is None:
92
- return ""
103
+ if data is None or (data == []):
104
+ return "-"
93
105
  if self.stat_fn:
94
106
  data = self.stat_fn(data)
107
+ cell_str = "\n".join(
108
+ [
109
+ _wrap("-" if x is None else f"{x:{self.format_str}}", self.wrap)
110
+ for x in _to_list(data)
111
+ ]
112
+ )
113
+ return cell_str
114
+
115
+
116
+ class DependentStat(TableColumn):
117
+ """
118
+ These are for statistics already declared by the tool or basic build stats that
119
+ rely on one or more additional stats to compute their value. The dependency is
120
+ embodied by the stat_fn function.
121
+ """
122
+
123
+ def __init__(
124
+ self,
125
+ column_header,
126
+ stats,
127
+ format_str,
128
+ align="center",
129
+ omit_if_lean=False,
130
+ wrap=None,
131
+ stat_fn=None,
132
+ ):
133
+ self.column_header = column_header
134
+ self.stats = stats
135
+ self.format_str = format_str
136
+ self.align = align
137
+ self.omit_if_lean = omit_if_lean
138
+ self.wrap = wrap or self.default_wrap
139
+ self.stat_fn = stat_fn
140
+
141
+ def get_str(self, build_stats, lean=False):
142
+ if lean and self.omit_if_lean:
143
+ return None
144
+ stats_data = [build_stats.get(stat, None) for stat in self.stats]
145
+ if self.stat_fn:
146
+ data = self.stat_fn(stats_data)
147
+ else:
148
+ data = stats_data[0]
149
+ if data is None or (data == []):
150
+ return "-"
95
151
  cell_str = "\n".join(
96
152
  [_wrap(f"{x:{self.format_str}}", self.wrap) for x in _to_list(data)]
97
153
  )
@@ -434,10 +490,12 @@ class Table(ABC):
434
490
  row = []
435
491
 
436
492
  # First columns
493
+ first_columns_count = 0
437
494
  for entry in first_columns:
438
495
  entry_str = entry.get_str(build_stats, self.lean)
439
496
  if entry_str is not None:
440
497
  row.append(entry_str)
498
+ first_columns_count += 1
441
499
 
442
500
  # Per tool columns
443
501
  for tool in tools:
@@ -460,22 +518,24 @@ class Table(ABC):
460
518
  row.append(entry_str)
461
519
 
462
520
  # Final columns
521
+ last_columns_count = 0
463
522
  for entry in last_columns:
464
523
  entry_str = entry.get_str(build_stats, self.lean)
465
524
  if entry_str is not None:
466
525
  row.append(entry_str)
526
+ last_columns_count += 1
467
527
 
468
528
  # See if this row should be merged with the last row
469
529
  if last_build_stats and self.merge_test_fn(last_build_stats, build_stats):
470
530
  # Merge with last row
471
- for col in range(0, len(first_columns)):
531
+ for col in range(0, first_columns_count):
472
532
  # If identical, don't duplicate
473
533
  if last_row[col] != row[col]:
474
534
  last_row[col] = _merge_join(last_row[col], row[col])
475
- for col in range(len(first_columns), len(row) - len(last_columns)):
535
+ for col in range(first_columns_count, len(row) - last_columns_count):
476
536
  # Allow duplicates
477
537
  last_row[col] = _merge_join(last_row[col], row[col])
478
- for col in range(len(row) - len(last_columns), len(row)):
538
+ for col in range(len(row) - last_columns_count, len(row)):
479
539
  # If identical, don't duplicate
480
540
  if last_row[col] != row[col]:
481
541
  last_row[col] = _merge_join(last_row[col], row[col])
@@ -581,17 +641,25 @@ class LemonadePerfTable(Table):
581
641
  Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
582
642
  ".2f",
583
643
  ),
644
+ StatWithSD(
645
+ _wrap("Prefill Tokens per Second", 8),
646
+ Keys.PREFILL_TOKENS_PER_SECOND,
647
+ Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
648
+ ".2f",
649
+ ),
584
650
  StatWithSD(
585
651
  _wrap("Tokens per Second", 8),
586
652
  Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
587
653
  Keys.STD_DEV_TOKENS_PER_SECOND,
588
654
  ".2f",
589
655
  ),
590
- SimpleStat(
656
+ DependentStat(
591
657
  _wrap("Total Generated Tokens", 9),
592
- Keys.RESPONSE_TOKENS,
658
+ [Keys.RESPONSE_TOKENS, Keys.PROMPT_TOKENS],
593
659
  "d",
594
- stat_fn=lambda x: sum(_to_list(x)),
660
+ stat_fn=lambda x: _window_sum(
661
+ _to_list(x[0]), n_windows=len(_to_list(x[1]))
662
+ ),
595
663
  ),
596
664
  SimpleStat(
597
665
  _wrap("Memory Used (GB)", 8), Keys.MAX_MEMORY_USED_GBYTE, ".3f"
@@ -42,12 +42,6 @@ class FlmServer(WrappedServer):
42
42
  self.flm_model_name = None
43
43
  super().__init__(server_name="flm-server", telemetry=FlmTelemetry())
44
44
 
45
- def _choose_port(self):
46
- """
47
- `flm serve` doesn't support port selection as of v0.9.10
48
- """
49
- self.port = 11434
50
-
51
45
  def address(self):
52
46
  return f"http://localhost:{self.port}/v1"
53
47
 
@@ -83,6 +77,8 @@ class FlmServer(WrappedServer):
83
77
  f"{self.flm_model_name}",
84
78
  "--ctx-len",
85
79
  str(ctx_size),
80
+ "--port",
81
+ str(self.port),
86
82
  ]
87
83
 
88
84
  # Set up environment with library path for Linux
@@ -15,9 +15,16 @@ from lemonade.tools.llamacpp.utils import (
15
15
  get_llama_server_exe_path,
16
16
  install_llamacpp,
17
17
  download_gguf,
18
+ resolve_local_gguf_model,
19
+ parse_checkpoint,
18
20
  )
19
21
  from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
20
22
 
23
+ # Embedding model batch configuration set to 8192 as default
24
+ EMBEDDING_CTX_SIZE = 8192
25
+ EMBEDDING_BATCH_SIZE = 8192
26
+ EMBEDDING_UBATCH_SIZE = 8192
27
+
21
28
 
22
29
  class LlamaTelemetry(WrappedServerTelemetry):
23
30
  """
@@ -101,8 +108,25 @@ class LlamaServer(WrappedServer):
101
108
  self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
102
109
  ) -> dict:
103
110
  """
104
- Download a model for the wrapper server
111
+ Download a model for the wrapper server.
112
+ First checks local cache, then downloads from internet if needed.
105
113
  """
114
+ # If it's a direct file path, just return it
115
+
116
+ if os.path.exists(config_checkpoint):
117
+ result = {"variant": config_checkpoint}
118
+ if config_mmproj:
119
+ result["mmproj"] = config_mmproj
120
+ return result
121
+
122
+ # Try to resolve from local cache first to avoid unnecessary downloads
123
+ checkpoint, variant = parse_checkpoint(config_checkpoint)
124
+ local_result = resolve_local_gguf_model(checkpoint, variant, config_mmproj)
125
+
126
+ if local_result:
127
+ return local_result
128
+
129
+ # Not found locally - download from internet
106
130
  return download_gguf(
107
131
  config_checkpoint=config_checkpoint,
108
132
  config_mmproj=config_mmproj,
@@ -135,6 +159,12 @@ class LlamaServer(WrappedServer):
135
159
  # Get the current executable path (handles both Windows and Ubuntu structures)
136
160
  exe_path = get_llama_server_exe_path(self.backend)
137
161
 
162
+ # For embedding models, use a larger context size to support longer individual
163
+ # strings. Embedding requests can include multiple strings in a batch, and each
164
+ # string needs to fit within the context window.
165
+ if supports_embeddings and ctx_size < EMBEDDING_CTX_SIZE:
166
+ ctx_size = EMBEDDING_CTX_SIZE
167
+
138
168
  # Build the base command
139
169
  base_command = [
140
170
  exe_path,
@@ -180,7 +210,18 @@ class LlamaServer(WrappedServer):
180
210
 
181
211
  # Add embeddings support if the model supports it
182
212
  if supports_embeddings:
183
- base_command.append("--embeddings")
213
+ # For embedding models, set batch sizes to handle multiple documents in a single request
214
+ # batch-size: logical batch size (total tokens across all sequences)
215
+ # ubatch-size: physical batch size (tokens processed in a single forward pass)
216
+ base_command.extend(
217
+ [
218
+ "--embeddings",
219
+ "--batch-size",
220
+ str(EMBEDDING_BATCH_SIZE),
221
+ "--ubatch-size",
222
+ str(EMBEDDING_UBATCH_SIZE),
223
+ ]
224
+ )
184
225
 
185
226
  # Add reranking support if the model supports it
186
227
  if supports_reranking: