lemonade-sdk 8.1.2.tar.gz → 8.1.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.
Files changed (84)
  1. {lemonade_sdk-8.1.2/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.3}/PKG-INFO +7 -6
  2. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/README.md +1 -1
  3. lemonade_sdk-8.1.3/pyproject.toml +8 -0
  4. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/setup.py +7 -7
  5. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/utils.py +54 -33
  6. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/llamacpp.py +96 -4
  7. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/serve.py +74 -8
  8. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/chat.js +735 -0
  9. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/model-settings.js +162 -0
  10. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/models.js +865 -0
  11. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/js/shared.js +491 -0
  12. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/static/styles.css +652 -26
  13. lemonade_sdk-8.1.3/src/lemonade/tools/server/static/webapp.html +257 -0
  14. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/utils/port.py +3 -2
  15. lemonade_sdk-8.1.3/src/lemonade/version.py +1 -0
  16. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3/src/lemonade_sdk.egg-info}/PKG-INFO +7 -6
  17. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/SOURCES.txt +5 -0
  18. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/requires.txt +7 -5
  19. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_server/cli.py +31 -17
  20. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_server/pydantic_models.py +15 -3
  21. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_server/server_models.json +9 -3
  22. lemonade_sdk-8.1.2/src/lemonade/tools/server/static/webapp.html +0 -1204
  23. lemonade_sdk-8.1.2/src/lemonade/version.py +0 -1
  24. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/LICENSE +0 -0
  25. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/NOTICE.md +0 -0
  26. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/setup.cfg +0 -0
  27. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/__init__.py +0 -0
  28. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/api.py +0 -0
  29. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/cache.py +0 -0
  30. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/cli.py +0 -0
  31. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/__init__.py +0 -0
  32. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/build.py +0 -0
  33. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/cli_helpers.py +0 -0
  34. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/exceptions.py +0 -0
  35. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/filesystem.py +0 -0
  36. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/inference_engines.py +0 -0
  37. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/network.py +0 -0
  38. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/printing.py +0 -0
  39. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/status.py +0 -0
  40. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/system_info.py +0 -0
  41. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/common/test_helpers.py +0 -0
  42. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/profilers/__init__.py +0 -0
  43. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/profilers/memory_tracker.py +0 -0
  44. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/profilers/profiler.py +0 -0
  45. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/sequence.py +0 -0
  46. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/state.py +0 -0
  47. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/__init__.py +0 -0
  48. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/accuracy.py +0 -0
  49. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/adapter.py +0 -0
  50. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/bench.py +0 -0
  51. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/huggingface/bench.py +0 -0
  52. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/huggingface/load.py +0 -0
  53. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/huggingface/utils.py +0 -0
  54. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/humaneval.py +0 -0
  55. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/llamacpp/bench.py +0 -0
  56. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/llamacpp/load.py +0 -0
  57. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/llamacpp/utils.py +0 -0
  58. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/management_tools.py +0 -0
  59. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/mmlu.py +0 -0
  60. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/__init__.py +0 -0
  61. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/bench.py +0 -0
  62. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/oga/load.py +0 -0
  63. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/perplexity.py +0 -0
  64. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/prompt.py +0 -0
  65. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/quark/__init__.py +0 -0
  66. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/quark/quark_load.py +0 -0
  67. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/quark/quark_quantize.py +0 -0
  68. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/report/__init__.py +0 -0
  69. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/report/llm_report.py +0 -0
  70. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/report/table.py +0 -0
  71. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/__init__.py +0 -0
  72. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/static/favicon.ico +0 -0
  73. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/tool_calls.py +0 -0
  74. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/tray.py +0 -0
  75. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/utils/system_tray.py +0 -0
  76. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/utils/thread.py +0 -0
  77. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/server/webapp.py +0 -0
  78. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade/tools/tool.py +0 -0
  79. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_install/__init__.py +0 -0
  80. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_install/install.py +0 -0
  81. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  82. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  83. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
  84. {lemonade_sdk-8.1.2 → lemonade_sdk-8.1.3}/src/lemonade_server/model_manager.py +0 -0
@@ -1,18 +1,18 @@
  Metadata-Version: 2.4
  Name: lemonade-sdk
- Version: 8.1.2
+ Version: 8.1.3
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
  Author-email: lemonade@amd.com
- Requires-Python: >=3.10, <3.13
+ Requires-Python: >=3.10, <3.14
  Description-Content-Type: text/markdown
  License-File: LICENSE
  License-File: NOTICE.md
  Requires-Dist: invoke>=2.0.0
- Requires-Dist: onnx<1.18.0,>=1.11.0
+ Requires-Dist: onnx==1.18.0
  Requires-Dist: pyyaml>=5.4
  Requires-Dist: typeguard>=2.3.13
  Requires-Dist: packaging>=20.9
- Requires-Dist: numpy<2.0.0
+ Requires-Dist: numpy
  Requires-Dist: fasteners
  Requires-Dist: GitPython>=3.1.40
  Requires-Dist: psutil>=6.1.1
@@ -41,9 +41,10 @@ Requires-Dist: accelerate; extra == "dev"
  Requires-Dist: datasets; extra == "dev"
  Requires-Dist: pandas>=1.5.3; extra == "dev"
  Requires-Dist: matplotlib; extra == "dev"
- Requires-Dist: model-generate==1.5.0; (platform_system == "Windows" and python_version == "3.10") and extra == "dev"
  Requires-Dist: human-eval-windows==1.0.4; extra == "dev"
  Requires-Dist: lm-eval[api]; extra == "dev"
+ Provides-Extra: model-generate
+ Requires-Dist: model-generate==1.5.0; (platform_system == "Windows" and python_version == "3.10") and extra == "model-generate"
  Provides-Extra: oga-hybrid
  Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-hybrid"
  Provides-Extra: oga-unified
@@ -105,7 +106,7 @@ Dynamic: summary
  <img src="https://img.shields.io/badge/Ubuntu-24.04%20%7C%2025.04-E95420?logo=ubuntu&logoColor=white" alt="Ubuntu 24.04 | 25.04" />
  </a>
  <a href="docs/README.md#installation" title="Check out our instructions">
- <img src="https://img.shields.io/badge/Python-3.10%20%7C%203.12-blue?logo=python&logoColor=white" alt="Made with Python" />
+ <img src="https://img.shields.io/badge/Python-3.10--3.13-blue?logo=python&logoColor=white" alt="Made with Python" />
  </a>
  <a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/contribute.md" title="Contribution Guide">
  <img src="https://img.shields.io/badge/PRs-welcome-brightgreen.svg" alt="PRs Welcome" />
@@ -14,7 +14,7 @@
  <img src="https://img.shields.io/badge/Ubuntu-24.04%20%7C%2025.04-E95420?logo=ubuntu&logoColor=white" alt="Ubuntu 24.04 | 25.04" />
  </a>
  <a href="docs/README.md#installation" title="Check out our instructions">
- <img src="https://img.shields.io/badge/Python-3.10%20%7C%203.12-blue?logo=python&logoColor=white" alt="Made with Python" />
+ <img src="https://img.shields.io/badge/Python-3.10--3.13-blue?logo=python&logoColor=white" alt="Made with Python" />
  </a>
  <a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/contribute.md" title="Contribution Guide">
  <img src="https://img.shields.io/badge/PRs-welcome-brightgreen.svg" alt="PRs Welcome" />
@@ -0,0 +1,8 @@
+ [build-system]
+ requires = [
+     "setuptools>=68",
+     "wheel"
+ ]
+ build-backend = "setuptools.build_meta"
+
+
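
Note: with this `[build-system]` table in place, the sdist builds through a standard PEP 517 frontend instead of a direct `setup.py` invocation. A minimal sketch of driving the build from Python, assuming the `build` package is installed (it is not a dependency declared anywhere in this diff):

```python
# Build the sdist and wheel via the PEP 517 backend declared above
# (setuptools.build_meta). Assumes: pip install build
import subprocess
import sys

subprocess.run([sys.executable, "-m", "build", "--sdist", "--wheel"], check=True)
```
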
@@ -28,13 +28,11 @@ setup(
          # Minimal dependencies required for end-users who are running
          # apps deployed on Lemonade SDK
          "invoke>=2.0.0",
-         "onnx>=1.11.0,<1.18.0",
+         "onnx==1.18.0",
          "pyyaml>=5.4",
          "typeguard>=2.3.13",
          "packaging>=20.9",
-         # Necessary until upstream packages account for the breaking
-         # change to numpy
-         "numpy<2.0.0",
+         "numpy",
          "fasteners",
          "GitPython>=3.1.40",
          "psutil>=6.1.1",
@@ -74,12 +72,14 @@ setup(
              "datasets",
              "pandas>=1.5.3",
              "matplotlib",
-             "model-generate==1.5.0; platform_system=='Windows' and python_version=='3.10'",
              # Install human-eval from a forked repo with Windows support until the
              # PR (https://github.com/openai/human-eval/pull/53) is merged
              "human-eval-windows==1.0.4",
              "lm-eval[api]",
          ],
+         "model-generate": [
+             "model-generate==1.5.0; platform_system=='Windows' and python_version=='3.10'",
+         ],
          # Keep backwards compatibility for old extras names
          "oga-hybrid": ["lemonade-sdk[oga-ryzenai]"],
          "oga-unified": ["lemonade-sdk[oga-ryzenai]"],
@@ -128,13 +128,13 @@ setup(
              "lsdev=lemonade_server.cli:developer_entrypoint",
          ]
      },
-     python_requires=">=3.10, <3.13",
+     python_requires=">=3.10, <3.14",
      long_description=open("README.md", "r", encoding="utf-8").read(),
      long_description_content_type="text/markdown",
      include_package_data=True,
      package_data={
          "lemonade_server": ["server_models.json"],
-         "lemonade": ["tools/server/static/*"],
+         "lemonade": ["tools/server/static/**/*"],
      },
  )

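Note: the `package_data` pattern for `lemonade` changes from `tools/server/static/*` to `tools/server/static/**/*`, a recursive glob that picks up the new `static/js/` subdirectory. A sketch of what the pattern should match, using `pathlib.rglob` as a stand-in for setuptools' glob handling (recursive `**` support in `package_data` is assumed to be available in the setuptools version used to build):

```python
# List the files "tools/server/static/**/*" would match; run from the
# unpacked sdist root. The path comes from this diff's file list.
from pathlib import Path

static_root = Path("src/lemonade/tools/server/static")
for f in sorted(static_root.rglob("*")):
    if f.is_file():
        # Expected to include js/chat.js, js/model-settings.js, js/models.js,
        # js/shared.js, styles.css, webapp.html, favicon.ico
        print(f.relative_to(static_root))
```
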
@@ -100,9 +100,10 @@ class OrtGenaiModel(ModelAdapter):
          max_new_tokens=512,
          min_new_tokens=0,
          do_sample=True,
-         top_k=50,
-         top_p=1.0,
-         temperature=0.7,
+         top_k=None,
+         top_p=None,
+         temperature=None,
+         repeat_penalty=None,
          streamer: OrtGenaiStreamer = None,
          pad_token_id=None,
          stopping_criteria=None,
@@ -154,38 +155,58 @@ class OrtGenaiModel(ModelAdapter):
          if random_seed is None:
              random_seed = -1  # In og.Generator, -1 = seed with random device

+         # Get search config if available, otherwise use empty dict
+         # Thanks to the empty dict, if the model doesn't have a built-in search
+         # config, the .get() calls will all just use the default values
+         search_config = {}
          if self.config and "search" in self.config:
              search_config = self.config["search"]
-             params.set_search_options(
-                 do_sample=search_config.get("do_sample", do_sample),
-                 top_k=search_config.get("top_k", top_k),
-                 top_p=search_config.get("top_p", top_p),
-                 temperature=search_config.get("temperature", temperature),
-                 max_length=max_length_to_use,
-                 min_length=min_length,
-                 early_stopping=search_config.get("early_stopping", False),
-                 length_penalty=search_config.get("length_penalty", 1.0),
-                 num_beams=search_config.get("num_beams", 1),
-                 num_return_sequences=search_config.get("num_return_sequences", 1),
-                 repetition_penalty=search_config.get("repetition_penalty", 1.0),
-                 past_present_share_buffer=search_config.get(
-                     "past_present_share_buffer", True
-                 ),
-                 random_seed=random_seed,
-                 # Not currently supported by OGA
-                 # diversity_penalty=search_config.get('diversity_penalty', 0.0),
-                 # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
-             )
-         else:
-             params.set_search_options(
-                 do_sample=do_sample,
-                 top_k=top_k,
-                 top_p=top_p,
-                 temperature=temperature,
-                 max_length=max_length_to_use,
-                 min_length=min_length,
-                 random_seed=random_seed,
-             )
+
+         # Apply parameter hierarchy: user provided > search config > defaults
+         default_top_k = 50
+         default_top_p = 1.0
+         default_temperature = 0.7
+         default_repetition_penalty = 1.0
+
+         top_k_to_use = (
+             top_k if top_k is not None else search_config.get("top_k", default_top_k)
+         )
+         top_p_to_use = (
+             top_p if top_p is not None else search_config.get("top_p", default_top_p)
+         )
+         temperature_to_use = (
+             temperature
+             if temperature is not None
+             else search_config.get("temperature", default_temperature)
+         )
+         # Map the llamacpp name, `repeat_penalty`, to the OGA name, `repetition_penalty`
+         repetition_penalty_to_use = (
+             repeat_penalty
+             if repeat_penalty is not None
+             else search_config.get("repetition_penalty", default_repetition_penalty)
+         )
+
+         # Set search options once with all parameters
+         params.set_search_options(
+             do_sample=search_config.get("do_sample", do_sample),
+             top_k=top_k_to_use,
+             top_p=top_p_to_use,
+             temperature=temperature_to_use,
+             repetition_penalty=repetition_penalty_to_use,
+             max_length=max_length_to_use,
+             min_length=min_length,
+             early_stopping=search_config.get("early_stopping", False),
+             length_penalty=search_config.get("length_penalty", 1.0),
+             num_beams=search_config.get("num_beams", 1),
+             num_return_sequences=search_config.get("num_return_sequences", 1),
+             past_present_share_buffer=search_config.get(
+                 "past_present_share_buffer", True
+             ),
+             random_seed=random_seed,
+             # Not currently supported by OGA
+             # diversity_penalty=search_config.get('diversity_penalty', 0.0),
+             # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
+         )
          params.try_graph_capture_with_max_batch_size(1)

          generator = og.Generator(self.model, params)
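
Note: the hunk above (src/lemonade/tools/oga/utils.py) collapses the old two-branch `set_search_options` logic into a single call, resolving each sampler knob as "user-provided value, else model search config, else default". A standalone sketch of that precedence rule (names here are illustrative, not part of the package):

```python
# "User provided > search config > defaults" resolution, as one helper.
def resolve(user_value, search_config: dict, key: str, default):
    """Return the caller's value if set, else the model's config, else a default."""
    if user_value is not None:
        return user_value
    return search_config.get(key, default)

search_config = {"temperature": 0.6}  # e.g. the "search" section of a model config
assert resolve(0.2, search_config, "temperature", 0.7) == 0.2   # user wins
assert resolve(None, search_config, "temperature", 0.7) == 0.6  # model config
assert resolve(None, search_config, "top_k", 50) == 50          # built-in default
```
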
@@ -43,6 +43,72 @@ def llamacpp_address(port: int) -> str:
      return f"http://127.0.0.1:{port}/v1"


+ def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
+     """
+     Separate standard OpenAI parameters from custom llama.cpp parameters.
+
+     Args:
+         request_dict: Dictionary of all request parameters
+         endpoint_type: Type of endpoint ("chat" or "completion")
+
+     Returns:
+         Dictionary with parameters properly separated for OpenAI client
+     """
+     openai_client_params = {}
+     extra_params = {}
+
+     # Common OpenAI parameters for both endpoint types
+     common_params = {
+         "model",
+         "frequency_penalty",
+         "logit_bias",
+         "logprobs",
+         "max_tokens",
+         "n",
+         "presence_penalty",
+         "seed",
+         "stop",
+         "stream",
+         "temperature",
+         "top_p",
+         "user",
+     }
+
+     # Standard OpenAI parameters by endpoint type
+     if endpoint_type == "chat":
+         chat_specific_params = {
+             "messages",
+             "top_logprobs",
+             "response_format",
+             "service_tier",
+             "stream_options",
+             "tools",
+             "tool_choice",
+             "parallel_tool_calls",
+         }
+         openai_params = common_params | chat_specific_params
+     else:  # completion
+         completion_specific_params = {
+             "prompt",
+             "best_of",
+             "echo",
+             "suffix",
+         }
+         openai_params = common_params | completion_specific_params
+
+     for key, value in request_dict.items():
+         if key in openai_params:
+             openai_client_params[key] = value
+         else:
+             extra_params[key] = value
+
+     # If there are custom parameters, use extra_body to pass them through
+     if extra_params:
+         openai_client_params["extra_body"] = extra_params
+
+     return openai_client_params
+
+
  class LlamaTelemetry:
      """
      Manages telemetry data collection and display for llama server.
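
Note: given the sets above, anything outside the OpenAI surface lands in `extra_params` and is forwarded via `extra_body`, which the OpenAI Python client merges into the request JSON it sends upstream. A hedged illustration of the helper's output (values made up):

```python
request_dict = {
    "model": "some-gguf-model",                      # illustrative
    "messages": [{"role": "user", "content": "hi"}],
    "temperature": 0.7,     # standard OpenAI parameter: stays top-level
    "top_k": 40,            # llama.cpp extension: routed to extra_body
    "repeat_penalty": 1.1,  # llama.cpp extension: routed to extra_body
}
params = _separate_openai_params(request_dict, "chat")
# params == {
#     "model": "some-gguf-model",
#     "messages": [{"role": "user", "content": "hi"}],
#     "temperature": 0.7,
#     "extra_body": {"top_k": 40, "repeat_penalty": 1.1},
# }
```
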
@@ -226,6 +292,11 @@ def _launch_llama_subprocess(
          "--ctx-size",
          str(ctx_size),
      ]
+
+     # Lock random seed for deterministic behavior in CI
+     if os.environ.get("LEMONADE_CI_MODE"):
+         base_command.extend(["--seed", "42"])
+
      if "mmproj" in snapshot_files:
          base_command.extend(["--mmproj", snapshot_files["mmproj"]])
      if not use_gpu:
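
Note: the seed lock above is gated on an environment variable rather than a CLI flag, so any truthy value of `LEMONADE_CI_MODE` opts a run into deterministic sampling. A sketch of how a CI harness might enable it (the `lsdev serve` invocation is an assumption based on the entry points shown in this diff's setup.py):

```python
import os
import subprocess

# Any non-empty string satisfies os.environ.get("LEMONADE_CI_MODE")
env = dict(os.environ, LEMONADE_CI_MODE="1")

# The launcher then appends ["--seed", "42"] to the llama-server command,
# making sampling reproducible across CI runs.
subprocess.run(["lsdev", "serve"], env=env, check=True)
```
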
@@ -238,6 +309,15 @@ def _launch_llama_subprocess(
      # Add port and jinja to enable tool use
      base_command.extend(["--port", str(telemetry.port), "--jinja"])

+     # Disable jinja for gpt-oss-120b on Vulkan
+     if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
+         base_command.remove("--jinja")
+         logging.warning(
+             "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+             "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+             "The model cannot use tools. If needed, use the ROCm backend instead."
+         )
+
      # Use legacy reasoning formatting, since not all apps support the new
      # reasoning_content field
      base_command.extend(["--reasoning-format", "none"])
@@ -384,13 +464,17 @@ def chat_completion(
          exclude_unset=True, exclude_none=True
      )

+     # Separate standard OpenAI parameters from custom llama.cpp parameters
+     openai_client_params = _separate_openai_params(request_dict, "chat")
+
      # Check if streaming is requested
      if chat_completion_request.stream:

          def event_stream():
              try:
                  # Enable streaming
-                 for chunk in client.chat.completions.create(**request_dict):
+                 # pylint: disable=missing-kwoa
+                 for chunk in client.chat.completions.create(**openai_client_params):
                      yield f"data: {chunk.model_dump_json()}\n\n"
                  yield "data: [DONE]\n\n"

@@ -412,7 +496,8 @@ def chat_completion(
      # Non-streaming response
      try:
          # Disable streaming for non-streaming requests
-         response = client.chat.completions.create(**request_dict)
+         # pylint: disable=missing-kwoa
+         response = client.chat.completions.create(**openai_client_params)

          # Show telemetry after completion
          telemetry.show_telemetry()
@@ -420,6 +505,7 @@ def chat_completion(
          return response

      except Exception as e:  # pylint: disable=broad-exception-caught
+         logging.error("Error during chat completion: %s", str(e))
          raise HTTPException(
              status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
              detail=f"Chat completion error: {str(e)}",
@@ -446,13 +532,17 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
      # Convert Pydantic model to dict and remove unset/null values
      request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)

+     # Separate standard OpenAI parameters from custom llama.cpp parameters
+     openai_client_params = _separate_openai_params(request_dict, "completion")
+
      # Check if streaming is requested
      if completion_request.stream:

          def event_stream():
              try:
                  # Enable streaming
-                 for chunk in client.completions.create(**request_dict):
+                 # pylint: disable=missing-kwoa
+                 for chunk in client.completions.create(**openai_client_params):
                      yield f"data: {chunk.model_dump_json()}\n\n"
                  yield "data: [DONE]\n\n"

@@ -474,7 +564,8 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
      # Non-streaming response
      try:
          # Disable streaming for non-streaming requests
-         response = client.completions.create(**request_dict)
+         # pylint: disable=missing-kwoa
+         response = client.completions.create(**openai_client_params)

          # Show telemetry after completion
          telemetry.show_telemetry()
@@ -482,6 +573,7 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
          return response

      except Exception as e:  # pylint: disable=broad-exception-caught
+         logging.error("Error during completion: %s", str(e))
          raise HTTPException(
              status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
              detail=f"Completion error: {str(e)}",
@@ -54,7 +54,11 @@ from lemonade.tools.server.utils.port import lifespan

  from lemonade_server.model_manager import ModelManager
  from lemonade_server.pydantic_models import (
-     DEFAULT_MAX_NEW_TOKENS,
+     DEFAULT_PORT,
+     DEFAULT_HOST,
+     DEFAULT_LOG_LEVEL,
+     DEFAULT_LLAMACPP_BACKEND,
+     DEFAULT_CTX_SIZE,
      LoadConfig,
      CompletionRequest,
      ChatCompletionRequest,
@@ -65,19 +69,16 @@ from lemonade_server.pydantic_models import (
      DeleteConfig,
  )

+ # Set to a high number to allow for interesting experiences in real apps
+ # Tests should use the max_new_tokens argument to set a lower value
+ DEFAULT_MAX_NEW_TOKENS = 1500
+
  # Only import tray on Windows
  if platform.system() == "Windows":
      # pylint: disable=ungrouped-imports
      from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


- DEFAULT_PORT = 8000
- DEFAULT_HOST = "localhost"
- DEFAULT_LOG_LEVEL = "info"
- DEFAULT_LLAMACPP_BACKEND = "vulkan"
- DEFAULT_CTX_SIZE = 4096
-
-
  class ServerModel(Model):
      """
      An extension of OpenAI's Model class that adds
@@ -258,6 +259,47 @@ class Server:
          self.app.post(f"{prefix}/reranking")(self.reranking)
          self.app.post(f"{prefix}/rerank")(self.reranking)

+     def _log_request_parameters(self, request, endpoint_name: str):
+         """
+         Log request parameters excluding content fields like messages, prompt, or input.
+
+         Args:
+             request: Any request object (CompletionRequest, ChatCompletionRequest, etc.)
+             endpoint_name: Name of the endpoint for logging context
+         """
+         if not logging.getLogger().isEnabledFor(logging.DEBUG):
+             return
+
+         # Fields to exclude from logging (content fields)
+         excluded_fields = {"messages", "prompt", "input"}
+
+         # Get all attributes from the request object
+         request_params = {}
+         if hasattr(request, "__dict__"):
+             # For pydantic models, get the dict representation
+             if hasattr(request, "model_dump"):
+                 all_params = request.model_dump()
+             elif hasattr(request, "dict"):
+                 all_params = request.dict()
+             else:
+                 all_params = request.__dict__
+
+             # Filter out excluded fields and add special handling for certain fields
+             for key, value in all_params.items():
+                 if key not in excluded_fields:
+                     # Special handling for tools field - show count instead of full content
+                     if key == "tools" and value is not None:
+                         request_params[key] = (
+                             f"{len(value)} tools" if isinstance(value, list) else value
+                         )
+                     # Special handling for input type in responses
+                     elif key == "input" and hasattr(request, "input"):
+                         request_params["input_type"] = type(value).__name__
+                     else:
+                         request_params[key] = value
+
+         logging.debug(f"{endpoint_name} request parameters: {request_params}")
+
      def _setup_server_common(
          self,
          tray: bool = False,
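
Note: the helper above only does work when debug logging is enabled, drops content fields outright, and summarizes `tools` to a count. A self-contained demo of the same filtering idea (outside the `Server` class):

```python
import logging

logging.basicConfig(level=logging.DEBUG)

all_params = {
    "model": "demo-model",
    "messages": [{"role": "user", "content": "private text"}],  # excluded
    "temperature": 0.7,
    "tools": [{"type": "function"}, {"type": "function"}],      # summarized
}
excluded_fields = {"messages", "prompt", "input"}
request_params = {}
for key, value in all_params.items():
    if key not in excluded_fields:
        request_params[key] = (
            f"{len(value)} tools" if key == "tools" and isinstance(value, list)
            else value
        )

logging.debug("Chat completions request parameters: %s", request_params)
# -> ... {'model': 'demo-model', 'temperature': 0.7, 'tools': '2 tools'}
```
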
@@ -435,6 +477,9 @@ class Server:

          lc = self.initialize_load_config(completion_request)

+         # Log request parameters (excluding message content for brevity)
+         self._log_request_parameters(completion_request, "Completions")
+
          # Load the model if it's different from the currently loaded one
          await self.load_llm(lc)

@@ -456,6 +501,9 @@ class Server:
              "message": text,
              "stop": completion_request.stop,
              "temperature": completion_request.temperature,
+             "repeat_penalty": completion_request.repeat_penalty,
+             "top_k": completion_request.top_k,
+             "top_p": completion_request.top_p,
              "max_new_tokens": completion_request.max_tokens,
          }
@@ -564,6 +612,9 @@ class Server:

          lc = self.initialize_load_config(chat_completion_request)

+         # Log request parameters (excluding message history for brevity)
+         self._log_request_parameters(chat_completion_request, "Chat completions")
+
          # Load the model if it's different from the currently loaded one
          await self.load_llm(lc)

@@ -608,6 +659,9 @@ class Server:
              "message": text,
              "stop": chat_completion_request.stop,
              "temperature": chat_completion_request.temperature,
+             "repeat_penalty": chat_completion_request.repeat_penalty,
+             "top_k": chat_completion_request.top_k,
+             "top_p": chat_completion_request.top_p,
              "max_new_tokens": max_new_tokens,
          }
@@ -856,6 +910,9 @@ class Server:

          lc = self.initialize_load_config(responses_request)

+         # Log request parameters (excluding message history for brevity)
+         self._log_request_parameters(responses_request, "Responses")
+
          # Load the model if it's different from the currently loaded one
          await self.load_llm(lc)

@@ -877,6 +934,9 @@ class Server:
          generation_args = {
              "message": text,
              "temperature": responses_request.temperature,
+             "repeat_penalty": responses_request.repeat_penalty,
+             "top_k": responses_request.top_k,
+             "top_p": responses_request.top_p,
              "max_new_tokens": responses_request.max_output_tokens,
          }

@@ -1006,6 +1066,9 @@ class Server:
          stop: list[str] | str | None = None,
          max_new_tokens: int | None = None,
          temperature: float | None = None,
+         repeat_penalty: float | None = None,
+         top_k: int | None = None,
+         top_p: float | None = None,
      ):
          """
          Core streaming completion logic, separated from response handling.
@@ -1088,6 +1151,9 @@ class Server:
              "pad_token_id": tokenizer.eos_token_id,
              "stopping_criteria": stopping_criteria,
              "temperature": temperature,
+             "repeat_penalty": repeat_penalty,
+             "top_k": top_k,
+             "top_p": top_p,
          }

          # Initialize performance variables
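
Note: with `repeat_penalty`, `top_k`, and `top_p` now plumbed from the request models into `generation_args`, clients can set these sampler knobs per request. A sketch against a local server, assuming the default port 8000 and the `/api/v1` prefix (model name illustrative; the local server is assumed not to check the API key):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")
response = client.chat.completions.create(
    model="Llama-3.2-1B-Instruct-Hybrid",  # illustrative model name
    messages=[{"role": "user", "content": "Say hello."}],
    temperature=0.7,
    top_p=0.9,
    # Non-standard sampler knobs travel via extra_body, matching the
    # parameter separation added earlier in this diff.
    extra_body={"top_k": 40, "repeat_penalty": 1.1},
)
print(response.choices[0].message.content)
```
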