lemonade-sdk 7.0.0__tar.gz → 7.0.1__tar.gz

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.

Potentially problematic release.


This version of lemonade-sdk might be problematic.

Files changed (72)
  1. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/PKG-INFO +4 -7
  2. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/setup.py +4 -7
  3. lemonade_sdk-7.0.1/src/lemonade/tools/server/instructions.py +294 -0
  4. lemonade_sdk-7.0.1/src/lemonade/tools/server/llamacpp.py +289 -0
  5. lemonade_sdk-7.0.1/src/lemonade/tools/server/pydantic_models.py +83 -0
  6. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/server/serve.py +152 -146
  7. lemonade_sdk-7.0.1/src/lemonade/tools/server/static/styles.css +313 -0
  8. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/server/tool_calls.py +50 -43
  9. lemonade_sdk-7.0.1/src/lemonade/version.py +1 -0
  10. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_sdk.egg-info/PKG-INFO +4 -7
  11. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_sdk.egg-info/SOURCES.txt +4 -0
  12. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_sdk.egg-info/requires.txt +3 -6
  13. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_server/cli.py +4 -2
  14. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_server/model_manager.py +34 -17
  15. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_server/server_models.json +42 -0
  16. lemonade_sdk-7.0.0/src/lemonade/version.py +0 -1
  17. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/LICENSE +0 -0
  18. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/NOTICE.md +0 -0
  19. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/README.md +0 -0
  20. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/pyproject.toml +0 -0
  21. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/setup.cfg +0 -0
  22. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/__init__.py +0 -0
  23. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/api.py +0 -0
  24. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/cache.py +0 -0
  25. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/cli.py +0 -0
  26. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/__init__.py +0 -0
  27. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/analyze_model.py +0 -0
  28. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/build.py +0 -0
  29. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/cli_helpers.py +0 -0
  30. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/exceptions.py +0 -0
  31. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/filesystem.py +0 -0
  32. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/labels.py +0 -0
  33. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/onnx_helpers.py +0 -0
  34. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/plugins.py +0 -0
  35. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/printing.py +0 -0
  36. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/status.py +0 -0
  37. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/system_info.py +0 -0
  38. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/tensor_helpers.py +0 -0
  39. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/common/test_helpers.py +0 -0
  40. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/profilers/__init__.py +0 -0
  41. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/profilers/memory_tracker.py +0 -0
  42. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/profilers/profiler.py +0 -0
  43. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/sequence.py +0 -0
  44. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/state.py +0 -0
  45. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/__init__.py +0 -0
  46. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/adapter.py +0 -0
  47. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/bench.py +0 -0
  48. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/huggingface_bench.py +0 -0
  49. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/huggingface_load.py +0 -0
  50. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/humaneval.py +0 -0
  51. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/llamacpp.py +0 -0
  52. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/llamacpp_bench.py +0 -0
  53. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/management_tools.py +0 -0
  54. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/mmlu.py +0 -0
  55. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/ort_genai/__init__.py +0 -0
  56. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/ort_genai/oga.py +0 -0
  57. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/ort_genai/oga_bench.py +0 -0
  58. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/perplexity.py +0 -0
  59. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/prompt.py +0 -0
  60. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/quark/__init__.py +0 -0
  61. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/quark/quark_load.py +0 -0
  62. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/quark/quark_quantize.py +0 -0
  63. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/report/__init__.py +0 -0
  64. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/report/llm_report.py +0 -0
  65. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/report/table.py +0 -0
  66. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/server/__init__.py +0 -0
  67. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade/tools/tool.py +0 -0
  68. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_install/__init__.py +0 -0
  69. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_install/install.py +0 -0
  70. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  71. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  72. {lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
{lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lemonade-sdk
- Version: 7.0.0
+ Version: 7.0.1
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
  Author-email: lemonade@amd.com
  Requires-Python: >=3.10, <3.12
@@ -25,8 +25,8 @@ Requires-Dist: matplotlib
  Requires-Dist: tabulate
  Requires-Dist: huggingface-hub==0.30.2
  Provides-Extra: llm
- Requires-Dist: torch>=2.0.0; extra == "llm"
- Requires-Dist: transformers; extra == "llm"
+ Requires-Dist: torch>=2.6.0; extra == "llm"
+ Requires-Dist: transformers<=4.51.3; extra == "llm"
  Requires-Dist: accelerate; extra == "llm"
  Requires-Dist: py-cpuinfo; extra == "llm"
  Requires-Dist: sentencepiece; extra == "llm"
@@ -34,23 +34,20 @@ Requires-Dist: datasets; extra == "llm"
  Requires-Dist: human-eval-windows==1.0.4; extra == "llm"
  Requires-Dist: fastapi; extra == "llm"
  Requires-Dist: uvicorn[standard]; extra == "llm"
- Requires-Dist: openai>=1.66.0; extra == "llm"
+ Requires-Dist: openai>=1.81.0; extra == "llm"
  Requires-Dist: lm-eval[api]; extra == "llm"
  Provides-Extra: llm-oga-cpu
  Requires-Dist: onnxruntime-genai==0.6.0; extra == "llm-oga-cpu"
  Requires-Dist: onnxruntime<1.22.0,>=1.10.1; extra == "llm-oga-cpu"
- Requires-Dist: torch<2.4,>=2.0.0; extra == "llm-oga-cpu"
  Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-cpu"
  Provides-Extra: llm-oga-igpu
  Requires-Dist: onnxruntime-genai-directml==0.6.0; extra == "llm-oga-igpu"
  Requires-Dist: onnxruntime-directml<1.22.0,>=1.19.0; extra == "llm-oga-igpu"
- Requires-Dist: torch<2.4,>=2.0.0; extra == "llm-oga-igpu"
  Requires-Dist: transformers<4.45.0; extra == "llm-oga-igpu"
  Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-igpu"
  Provides-Extra: llm-oga-cuda
  Requires-Dist: onnxruntime-genai-cuda==0.6.0; extra == "llm-oga-cuda"
  Requires-Dist: onnxruntime-gpu<1.22.0,>=1.19.1; extra == "llm-oga-cuda"
- Requires-Dist: torch<2.4,>=2.0.0; extra == "llm-oga-cuda"
  Requires-Dist: transformers<4.45.0; extra == "llm-oga-cuda"
  Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-cuda"
  Provides-Extra: llm-oga-npu
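
These Requires-Dist pins are what pip resolves when installing lemonade-sdk and its extras; 7.0.1 raises torch to >=2.6.0, caps transformers at 4.51.3, bumps openai to >=1.81.0, and drops the per-extra torch<2.4 pins. A small sketch of how the published pins can be inspected from an installed copy, using only the standard-library importlib.metadata (nothing lemonade-specific is assumed beyond the distribution name):

# Print the torch/transformers/openai pins recorded in the installed
# lemonade-sdk metadata (the same Requires-Dist entries shown above).
from importlib.metadata import requires

for req in requires("lemonade-sdk") or []:
    if req.startswith(("torch", "transformers", "openai")):
        print(req)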
{lemonade_sdk-7.0.0 → lemonade_sdk-7.0.1}/setup.py

@@ -3,7 +3,6 @@ from setuptools import setup
  with open("src/lemonade/version.py", encoding="utf-8") as fp:
      version = fp.read().split('"')[1]
 
-
  setup(
      name="lemonade-sdk",
      version=version,
@@ -46,8 +45,8 @@ setup(
      ],
      extras_require={
          "llm": [
-             "torch>=2.0.0",
-             "transformers",
+             "torch>=2.6.0",
+             "transformers<=4.51.3",
              "accelerate",
              "py-cpuinfo",
              "sentencepiece",
@@ -57,26 +56,23 @@ setup(
              "human-eval-windows==1.0.4",
              "fastapi",
              "uvicorn[standard]",
-             "openai>=1.66.0",
+             "openai>=1.81.0",
              "lm-eval[api]",
          ],
          "llm-oga-cpu": [
              "onnxruntime-genai==0.6.0",
              "onnxruntime >=1.10.1,<1.22.0",
-             "torch>=2.0.0,<2.4",
              "lemonade-sdk[llm]",
          ],
          "llm-oga-igpu": [
              "onnxruntime-genai-directml==0.6.0",
              "onnxruntime-directml>=1.19.0,<1.22.0",
-             "torch>=2.0.0,<2.4",
              "transformers<4.45.0",
              "lemonade-sdk[llm]",
          ],
          "llm-oga-cuda": [
              "onnxruntime-genai-cuda==0.6.0",
              "onnxruntime-gpu >=1.19.1,<1.22.0",
-             "torch>=2.0.0,<2.4",
              "transformers<4.45.0",
              "lemonade-sdk[llm]",
          ],
@@ -111,6 +107,7 @@ setup(
      include_package_data=True,
      package_data={
          "lemonade_server": ["server_models.json"],
+         "lemonade": ["tools/server/static/styles.css"],
      },
  )
 
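
The new src/lemonade/version.py (listed above as +1 -0) is not reproduced in this diff, but setup.py reads it and takes the first double-quoted string as the package version. A hypothetical sketch of what that one-line file would contain, assuming the conventional variable name:

# Hypothetical content of src/lemonade/version.py for 7.0.1; setup.py only
# cares about the first double-quoted string, not the variable name.
__version__ = "7.0.1"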
lemonade_sdk-7.0.1/src/lemonade/tools/server/instructions.py (new file)

@@ -0,0 +1,294 @@
+ from pathlib import Path
+ import json
+ from fastapi.responses import HTMLResponse
+ from lemonade_server.model_manager import ModelManager
+
+
+ def get_instructions_html(port=8000):
+     """
+     Show instructions on how to use the server.
+     """
+     # Load server models from JSON
+     server_models_path = (
+         Path(__file__).parent.parent.parent.parent
+         / "lemonade_server"
+         / "server_models.json"
+     )
+     with open(server_models_path, "r", encoding="utf-8") as f:
+         server_models = json.load(f)
+
+     # Use shared filter function from model_manager.py
+     filtered_models = ModelManager().filter_models_by_backend(server_models)
+
+     # Pass filtered server_models to JS
+     server_models_js = (
+         f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
+     )
+
+     # New lemon-themed HTML structure
+     # pylint: disable=W1401
+     styled_html = f"""
+ <!DOCTYPE html>
+ <html lang=\"en\">
+ <head>
+ <meta charset=\"UTF-8\">
+ <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">
+ <title>Lemonade Server</title>
+ <link rel="icon" href="data:,">
+ <link rel=\"stylesheet\" href=\"/static/styles.css\">
+ <script>
+ window.SERVER_PORT = {port};
+ </script>
+ {server_models_js}
+ </head>
+ <body>
+ <nav class=\"navbar\">
+ <a href=\"https://github.com/lemonade-sdk/lemonade\">GitHub</a>
+ <a href=\"https://lemonade-server.ai/docs/\">Docs</a>
+ <a href=\"https://lemonade-server.ai/docs/server/server_models/\">Models</a>
+ <a href=\"https://lemonade-server.ai/docs/server/apps/\">Featured Apps</a>
+ </nav>
+ <main class=\"main\">
+ <div class=\"title\">🍋 Lemonade Server</div>
+ <div class=\"tab-container\">
+ <div class=\"tabs\">
+ <button class=\"tab active\" id=\"tab-chat\" onclick=\"showTab('chat')\">LLM Chat</button>
+ <button class=\"tab\" id=\"tab-models\" onclick=\"showTab('models')\">Model Management</button>
+ </div>
+ <div class=\"tab-content active\" id=\"content-chat\">
+ <div class=\"chat-container\">
+ <div class=\"chat-history\" id=\"chat-history\"></div>
+ <div class=\"chat-input-row\">
+ <select id=\"model-select\"></select>
+ <input type=\"text\" id=\"chat-input\" placeholder=\"Type your message...\" />
+ <button id=\"send-btn\">Send</button>
+ </div>
+ </div>
+ </div>
+ <div class=\"tab-content\" id=\"content-models\">
+ <div class=\"model-mgmt-container\">
+ <div class=\"model-mgmt-pane\">
+ <h3>Installed Models</h3>
+ <table class=\"model-table\" id=\"installed-models-table\">
+ <colgroup><col style=\"width:100%\"></colgroup>
+ <tbody id=\"installed-models-tbody\"></tbody>
+ </table>
+ </div>
+ <div class=\"model-mgmt-pane\">
+ <h3>Suggested Models</h3>
+ <table class=\"model-table\" id=\"suggested-models-table\">
+ <tbody id=\"suggested-models-tbody\"></tbody>
+ </table>
+ </div>
+ </div>
+ </div>
+ </div>
+ </main>
+ <footer class=\"site-footer\">
+ <div class=\"dad-joke\">When life gives you LLMs, make an LLM aide.</div>
+ <div class=\"copyright\">Copyright 2025 AMD</div>
+ </footer>
+ <script src=\"https://cdn.jsdelivr.net/npm/openai@4.21.0/dist/openai.min.js\"></script>
+ <script>
+ // Tab switching logic
+ function showTab(tab) {{
+ document.getElementById('tab-chat').classList.remove('active');
+ document.getElementById('tab-models').classList.remove('active');
+ document.getElementById('content-chat').classList.remove('active');
+ document.getElementById('content-models').classList.remove('active');
+ if (tab === 'chat') {{
+ document.getElementById('tab-chat').classList.add('active');
+ document.getElementById('content-chat').classList.add('active');
+ }} else {{
+ document.getElementById('tab-models').classList.add('active');
+ document.getElementById('content-models').classList.add('active');
+ }}
+ }}
+
+ // Helper to get server base URL
+ function getServerBaseUrl() {{
+ const port = window.SERVER_PORT || 8000;
+ return `http://localhost:{port}`;
+ }}
+
+ // Populate model dropdown from /api/v1/models endpoint
+ async function loadModels() {{
+ try {{
+ const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
+ const data = await resp.json();
+ const select = document.getElementById('model-select');
+ select.innerHTML = '';
+ if (!data.data || !Array.isArray(data.data)) {{
+ select.innerHTML = '<option>No models found (malformed response)</option>';
+ return;
+ }}
+ if (data.data.length === 0) {{
+ select.innerHTML = '<option>No models available</option>';
+ return;
+ }}
+ let defaultIndex = 0;
+ data.data.forEach(function(model, index) {{
+ const modelId = model.id || model.name || model;
+ const opt = document.createElement('option');
+ opt.value = modelId;
+ opt.textContent = modelId;
+ if (modelId === 'Llama-3.2-1B-Instruct-Hybrid') {{
+ defaultIndex = index;
+ }}
+ select.appendChild(opt);
+ }});
+ select.selectedIndex = defaultIndex;
+ }} catch (e) {{
+ const select = document.getElementById('model-select');
+ select.innerHTML = `<option>Error loading models: ${{e.message}}</option>`;
+ console.error('Error loading models:', e);
+ }}
+ }}
+ loadModels();
+
+ // Model Management Tab Logic
+ async function refreshModelMgmtUI() {{
+ // Get installed models from /api/v1/models
+ let installed = [];
+ try {{
+ const resp = await fetch(getServerBaseUrl() + '/api/v1/models');
+ const data = await resp.json();
+ if (data.data && Array.isArray(data.data)) {{
+ installed = data.data.map(m => m.id || m.name || m);
+ }}
+ }} catch (e) {{}}
+ // All models from server_models.json (window.SERVER_MODELS)
+ const allModels = window.SERVER_MODELS || {{}};
+ // Filter suggested models not installed
+ const suggested = Object.keys(allModels).filter(
+ k => allModels[k].suggested && !installed.includes(k)
+ );
+ // Render installed models as a table (two columns, second is invisible)
+ const installedTbody = document.getElementById('installed-models-tbody');
+ installedTbody.innerHTML = '';
+ installed.forEach(function(mid) {{
+ var tr = document.createElement('tr');
+ var tdName = document.createElement('td');
+ tdName.textContent = mid;
+ var tdEmpty = document.createElement('td');
+ tdEmpty.style.width = '0';
+ tdEmpty.style.padding = '0';
+ tdEmpty.style.border = 'none';
+ tr.appendChild(tdName);
+ tr.appendChild(tdEmpty);
+ installedTbody.appendChild(tr);
+ }});
+ // Render suggested models as a table
+ const suggestedTbody = document.getElementById('suggested-models-tbody');
+ suggestedTbody.innerHTML = '';
+ suggested.forEach(mid => {{
+ const tr = document.createElement('tr');
+ const tdName = document.createElement('td');
+ tdName.textContent = mid;
+ tdName.style.paddingRight = '1em';
+ tdName.style.verticalAlign = 'middle';
+ const tdBtn = document.createElement('td');
+ tdBtn.style.width = '1%';
+ tdBtn.style.verticalAlign = 'middle';
+ const btn = document.createElement('button');
+ btn.textContent = '+';
+ btn.title = 'Install model';
+ btn.onclick = async function() {{
+ btn.disabled = true;
+ btn.textContent = 'Installing...';
+ btn.classList.add('installing-btn');
+ try {{
+ await fetch(getServerBaseUrl() + '/api/v1/pull', {{
+ method: 'POST',
+ headers: {{ 'Content-Type': 'application/json' }},
+ body: JSON.stringify({{ model_name: mid }})
+ }});
+ await refreshModelMgmtUI();
+ await loadModels(); // update chat dropdown too
+ }} catch (e) {{
+ btn.textContent = 'Error';
+ }}
+ }};
+ tdBtn.appendChild(btn);
+ tr.appendChild(tdName);
+ tr.appendChild(tdBtn);
+ suggestedTbody.appendChild(tr);
+ }});
+ }}
+ // Initial load
+ refreshModelMgmtUI();
+ // Optionally, refresh when switching to the tab
+ document.getElementById('tab-models').addEventListener('click', refreshModelMgmtUI);
+
+ // Chat logic (streaming with OpenAI JS client placeholder)
+ const chatHistory = document.getElementById('chat-history');
+ const chatInput = document.getElementById('chat-input');
+ const sendBtn = document.getElementById('send-btn');
+ const modelSelect = document.getElementById('model-select');
+ let messages = [];
+
+ function appendMessage(role, text) {{
+ const div = document.createElement('div');
+ div.className = 'chat-message ' + role;
+ // Add a bubble for iMessage style
+ const bubble = document.createElement('div');
+ bubble.className = 'chat-bubble ' + role;
+ bubble.innerHTML = text;
+ div.appendChild(bubble);
+ chatHistory.appendChild(div);
+ chatHistory.scrollTop = chatHistory.scrollHeight;
+ }}
+
+ async function sendMessage() {{
+ const text = chatInput.value.trim();
+ if (!text) return;
+ appendMessage('user', text);
+ messages.push({{ role: 'user', content: text }});
+ chatInput.value = '';
+ sendBtn.disabled = true;
+ // Streaming OpenAI completions (placeholder, adapt as needed)
+ let llmText = '';
+ appendMessage('llm', '...');
+ const llmDiv = chatHistory.lastChild.querySelector('.chat-bubble.llm');
+ try {{
+ // Use the correct endpoint for chat completions
+ const resp = await fetch(getServerBaseUrl() + '/api/v1/chat/completions', {{
+ method: 'POST',
+ headers: {{ 'Content-Type': 'application/json' }},
+ body: JSON.stringify({{
+ model: modelSelect.value,
+ messages: messages,
+ stream: true
+ }})
+ }});
+ if (!resp.body) throw new Error('No stream');
+ const reader = resp.body.getReader();
+ let decoder = new TextDecoder();
+ llmDiv.textContent = '';
+ while (true) {{
+ const {{ done, value }} = await reader.read();
+ if (done) break;
+ const chunk = decoder.decode(value);
+ if (chunk.trim() === 'data: [DONE]' || chunk.trim() === '[DONE]') continue;
+ // Try to extract the content from the OpenAI chunk
+ const match = chunk.match(/"content"\s*:\s*"([^"]*)"/);
+ if (match && match[1]) {{
+ llmText += match[1];
+ llmDiv.textContent = llmText;
+ }}
+ }}
+ messages.push({{ role: 'assistant', content: llmText }});
+ }} catch (e) {{
+ llmDiv.textContent = '[Error: ' + e.message + ']';
+ }}
+ sendBtn.disabled = false;
+ }}
+ sendBtn.onclick = sendMessage;
+ chatInput.addEventListener('keydown', function(e) {{
+ if (e.key === 'Enter') sendMessage();
+ }});
+ </script>
+ </body>
+ </html>
+     """
+     return HTMLResponse(content=styled_html)
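
instructions.py only builds the HTMLResponse; the actual routing lives in serve.py (modified in this release but not reproduced in this section). A minimal, hypothetical sketch of how the page and the /static/styles.css stylesheet could be wired into a FastAPI app, assuming the module sits next to the static/ directory; the real route names and app setup in serve.py may differ:

# Hypothetical wiring sketch, not the serve.py implementation.
from pathlib import Path

from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles

from lemonade.tools.server.instructions import get_instructions_html

app = FastAPI()

# styles.css ships as package data (see the setup.py hunk above) and is
# referenced by the page as /static/styles.css.
app.mount(
    "/static",
    StaticFiles(directory=Path(__file__).parent / "static"),
    name="static",
)


@app.get("/")
async def home():
    # get_instructions_html returns a ready-made HTMLResponse
    return get_instructions_html(port=8000)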
lemonade_sdk-7.0.1/src/lemonade/tools/server/llamacpp.py (new file)

@@ -0,0 +1,289 @@
+ import sys
+ import os
+ import logging
+ import time
+ import subprocess
+ import zipfile
+ import re
+ import threading
+
+ import requests
+ from tabulate import tabulate
+ from fastapi import HTTPException, status
+ from fastapi.responses import StreamingResponse
+
+ from openai import OpenAI
+
+ from lemonade_server.model_manager import ModelManager
+ from lemonade.tools.server.pydantic_models import ChatCompletionRequest
+
+ LLAMA_VERSION = "b5543"
+ LLAMA_SERVER_PORT = "8081"
+
+ LLAMA_SERVER_EXE_DIR = os.path.join(
+     os.path.dirname(sys.executable),
+     "llama_server",
+ )
+
+ LLAMA_SERVER_EXE_PATH = os.path.join(
+     LLAMA_SERVER_EXE_DIR,
+     "llama-server.exe",
+ )
+
+
+ class LlamaTelemetry:
+     """
+     Manages telemetry data collection and display for llama server.
+     """
+
+     def __init__(self):
+         self.input_tokens = None
+         self.output_tokens = None
+         self.time_to_first_token = None
+         self.tokens_per_second = None
+         self.prompt_eval_time = None
+         self.eval_time = None
+
+     def parse_telemetry_line(self, line: str):
+         """
+         Parse telemetry data from llama server output lines.
+         """
+
+         # Parse prompt evaluation line
+         prompt_match = re.search(
+             # pylint: disable=C0301
+             r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+             line,
+         )
+         if prompt_match:
+             prompt_time_ms = float(prompt_match.group(1))
+             input_tokens = int(prompt_match.group(2))
+
+             self.prompt_eval_time = prompt_time_ms / 1000.0
+             self.input_tokens = input_tokens
+             self.time_to_first_token = prompt_time_ms / 1000.0
+             return
+
+         # Parse generation evaluation line
+         eval_match = re.search(
+             r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+             line,
+         )
+         if eval_match:
+             eval_time_ms = float(eval_match.group(1))
+             output_tokens = int(eval_match.group(2))
+             tokens_per_second = float(eval_match.group(3))
+
+             self.eval_time = eval_time_ms / 1000.0
+             self.output_tokens = output_tokens
+             self.tokens_per_second = tokens_per_second
+             return
+
+     def get_telemetry_data(self):
+         return {
+             "input_tokens": self.input_tokens,
+             "output_tokens": self.output_tokens,
+             "time_to_first_token": self.time_to_first_token,
+             "tokens_per_second": self.tokens_per_second,
+             "decode_token_times": None,
+         }
+
+     def show_telemetry(self):
+         # Check if debug logging is enabled
+         if not logging.getLogger().isEnabledFor(logging.DEBUG):
+             return
+
+         # Prepare telemetry data (transposed format)
+         telemetry = [
+             ["Input tokens", self.input_tokens],
+             ["Output tokens", self.output_tokens],
+             ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
+             ["TPS", f"{self.tokens_per_second:.2f}"],
+         ]
+
+         table = tabulate(
+             telemetry, headers=["Metric", "Value"], tablefmt="fancy_grid"
+         ).split("\n")
+
+         # Show telemetry in debug while complying with uvicorn's log indentation
+         logging.debug("\n ".join(table))
+
+
+ def _log_subprocess_output(
+     process: subprocess.Popen, prefix: str, telemetry: LlamaTelemetry
+ ):
+     """
+     Read subprocess output line by line, log to debug, and parse telemetry
+     """
+
+     if process.stdout:
+         for line in iter(process.stdout.readline, ""):
+             if line:
+                 line_stripped = line.strip()
+                 logging.debug("%s: %s", prefix, line_stripped)
+
+                 telemetry.parse_telemetry_line(line_stripped)
+
+             if process.poll() is not None:
+                 break
+
+
+ def _wait_for_load(llama_server_process: subprocess.Popen, fail_message: str):
+     status_code = None
+     while not llama_server_process.poll() and status_code != 200:
+         health_url = f"http://localhost:{LLAMA_SERVER_PORT}/health"
+         try:
+             health_response = requests.get(health_url)
+         except requests.exceptions.ConnectionError:
+             logging.warning(fail_message)
+         else:
+             status_code = health_response.status_code
+             logging.debug(
+                 "Testing llama-server readiness (will retry until ready), "
+                 f"result: {health_response.json()}"
+             )
+         time.sleep(1)
+
+
+ def _launch_llama_subprocess(
+     model_path: str, use_gpu: bool, telemetry: LlamaTelemetry
+ ) -> subprocess.Popen:
+     """
+     Launch llama server subprocess with GPU or CPU configuration
+     """
+
+     base_command = [
+         LLAMA_SERVER_EXE_PATH,
+         "-m",
+         model_path,
+         "--port",
+         LLAMA_SERVER_PORT,
+     ]
+
+     # Configure GPU layers: 99 for GPU, 0 for CPU-only
+     ngl_value = "99" if use_gpu else "0"
+     command = base_command + ["-ngl", ngl_value]
+
+     # Start subprocess with output capture
+     process = subprocess.Popen(
+         command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1
+     )
+
+     # Start background thread to log subprocess output
+     device_type = "GPU" if use_gpu else "CPU"
+     threading.Thread(
+         target=_log_subprocess_output,
+         args=(process, f"LLAMA SERVER {device_type}", telemetry),
+         daemon=True,
+     ).start()
+
+     return process
+
+
+ def server_load(checkpoint: str, model_reference: str, telemetry: LlamaTelemetry):
+     # Download llama.cpp server if it isn't already available
+     if not os.path.exists(LLAMA_SERVER_EXE_DIR):
+         # Download llama.cpp server zip
+         # pylint: disable=C0301
+         llama_zip_url = f"https://github.com/ggml-org/llama.cpp/releases/download/{LLAMA_VERSION}/llama-{LLAMA_VERSION}-bin-win-vulkan-x64.zip"
+         llama_zip_path = os.path.join(
+             os.path.dirname(sys.executable), "llama-server.zip"
+         )
+         logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
+
+         with requests.get(llama_zip_url, stream=True) as r:
+             r.raise_for_status()
+             with open(llama_zip_path, "wb") as f:
+                 for chunk in r.iter_content(chunk_size=8192):
+                     f.write(chunk)
+
+         # Extract zip
+         logging.info(f"Extracting {llama_zip_path} to {LLAMA_SERVER_EXE_DIR}")
+         with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
+             zip_ref.extractall(LLAMA_SERVER_EXE_DIR)
+
+         # Save version.txt
+         version_txt_path = os.path.join(LLAMA_SERVER_EXE_DIR, "version.txt")
+         with open(version_txt_path, "w", encoding="utf-8") as vf:
+             vf.write(LLAMA_VERSION)
+
+         # Delete zip file
+         os.remove(llama_zip_path)
+         logging.info("Cleaned up zip file")
+
+     # Download the gguf to the hugging face cache
+     snapshot_path = ModelManager().download_gguf(checkpoint)
+     model_path = os.path.join(snapshot_path, os.listdir(snapshot_path)[0])
+     logging.debug(f"GGUF file path: {model_path}")
+
+     # Start the llama-serve.exe process
+     logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")
+
+     # Attempt loading on GPU first
+     llama_server_process = _launch_llama_subprocess(
+         model_path, use_gpu=True, telemetry=telemetry
+     )
+
+     # Check the /health endpoint until GPU server is ready
+     _wait_for_load(
+         llama_server_process,
+         f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
+     )
+
+     # If loading on GPU failed, try loading on CPU
+     if llama_server_process.poll():
+         llama_server_process = _launch_llama_subprocess(
+             model_path, use_gpu=False, telemetry=telemetry
+         )
+
+         # Check the /health endpoint until CPU server is ready
+         _wait_for_load(
+             llama_server_process,
+             f"Loading {model_reference} on CPU didn't work",
+         )
+
+     if llama_server_process.poll():
+         raise HTTPException(
+             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+             detail=f"Failed to load {model_reference} with llama.cpp",
+         )
+
+     return llama_server_process
+
+
+ def chat_completion(
+     chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
+ ):
+     base_url = f"http://127.0.0.1:{LLAMA_SERVER_PORT}/v1"
+     client = OpenAI(
+         base_url=base_url,
+         api_key="lemonade",
+     )
+
+     # Convert Pydantic model to dict and remove unset/null values
+     request_dict = chat_completion_request.model_dump(
+         exclude_unset=True, exclude_none=True
+     )
+
+     def event_stream():
+         try:
+             # Enable streaming
+             request_dict["stream"] = True
+             for chunk in client.chat.completions.create(**request_dict):
+                 yield f"data: {chunk.model_dump_json()}\n\n"
+             yield "data: [DONE]\n\n"
+
+             # Show telemetry after completion
+             telemetry.show_telemetry()
+
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+     return StreamingResponse(
+         event_stream(),
+         media_type="text/event-stream",
+         headers={
+             "Cache-Control": "no-cache",
+             "Connection": "keep-alive",
+         },
+     )
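
Taken together, llamacpp.py exposes a small surface to serve.py: construct a LlamaTelemetry, call server_load() to fetch the pinned llama.cpp build (b5543) plus the GGUF and launch llama-server.exe (GPU first, CPU fallback), then relay requests through chat_completion(), which streams SSE chunks from the llama.cpp server on port 8081. A rough usage sketch with made-up values; the checkpoint, model name, and ChatCompletionRequest fields are assumptions (pydantic_models.py is not shown in this section), and the real call sites are in serve.py:

# Illustrative only: checkpoint/model names are hypothetical examples.
from lemonade.tools.server.llamacpp import (
    LlamaTelemetry,
    chat_completion,
    server_load,
)
from lemonade.tools.server.pydantic_models import ChatCompletionRequest

telemetry = LlamaTelemetry()

# Downloads llama.cpp b5543 on first use, resolves the GGUF via
# ModelManager().download_gguf(), and tries -ngl 99 before falling back to -ngl 0.
process = server_load(
    checkpoint="example-org/Example-1B-GGUF",  # hypothetical GGUF checkpoint
    model_reference="Example-1B-GGUF",
    telemetry=telemetry,
)

# chat_completion() returns a FastAPI StreamingResponse that forwards the
# OpenAI-compatible SSE stream from http://127.0.0.1:8081/v1; the request
# fields below assume the OpenAI chat-completions schema.
request = ChatCompletionRequest(
    model="Example-1B-GGUF",
    messages=[{"role": "user", "content": "Hello!"}],
)
response = chat_completion(request, telemetry)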