lemonade-sdk 8.1.1__py3-none-any.whl → 8.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic.

@@ -5,7 +5,6 @@ import importlib.metadata
  import subprocess
  from abc import ABC, abstractmethod
  from typing import Dict, Optional
- import transformers


  class InferenceEngineDetector:
@@ -352,6 +351,7 @@ class TransformersDetector(BaseEngineDetector):

  try:
  import torch
+ import transformers

  if device_type == "cpu":
  result = {
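
Together, these two hunks turn transformers into a lazy dependency: the module-level import is removed and the import now happens inside the detection try-block, so the detector module can be imported without transformers installed. A minimal sketch of the pattern, with the function name invented for illustration:

```python
# Hypothetical sketch of deferring an optional dependency until it is needed.
def detect_transformers_version():
    try:
        import torch          # noqa: F401 - only needed inside this probe
        import transformers   # deferred so the module imports without it
    except ImportError:
        return None           # engine unavailable; caller can fall back
    return transformers.__version__
```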
@@ -57,7 +57,7 @@ def identify_rocm_arch_from_name(device_name: str) -> str | None:
  return None


- def identify_rocm_arch_and_hip_id() -> tuple[str, str]:
+ def identify_rocm_arch() -> str:
  """
  Identify the appropriate ROCm target architecture based on the device info
  Returns tuple of (architecture, gpu_type) where gpu_type is 'igpu' or 'dgpu'
@@ -68,21 +68,54 @@ def identify_rocm_arch_and_hip_id() -> tuple[str, str]:
  amd_igpu = system_info.get_amd_igpu_device()
  amd_dgpu = system_info.get_amd_dgpu_devices()
  target_arch = None
- gpu_count = 0
  for gpu in [amd_igpu] + amd_dgpu:
  if gpu.get("available") and gpu.get("name"):
- gpu_count += 1
  target_arch = identify_rocm_arch_from_name(gpu["name"].lower())
  if target_arch:
  break

- # Get HIP ID based on the number of GPUs available
- # Here, we assume that the iGPU will always show up before the dGPUs (if available)
- # We also assume that selecting the dGPU is preferred over the iGPU
- # Multiple GPUs are not supported at the moment
- hip_id = str(gpu_count - 1)
+ return target_arch

- return target_arch, hip_id
+
+ def identify_hip_id() -> str:
+ """
+ Identify the HIP ID
+ """
+ # Get HIP devices
+ hip_devices = get_hip_devices()
+ logging.debug(f"HIP devices found: {hip_devices}")
+ if len(hip_devices) == 0:
+ raise ValueError("No HIP devices found when identifying HIP ID")
+
+ # Identify HIP devices that are compatible with our ROCm builds
+ rocm_devices = []
+ for device in hip_devices:
+ device_id, device_name = device
+ if identify_rocm_arch_from_name(device_name):
+ rocm_devices.append([device_id, device_name])
+ logging.debug(f"ROCm devices found: {rocm_devices}")
+
+ # If no ROCm devices are found, use the last HIP device
+ # This might be needed in some scenarios where HIP reports generic device names
+ # Example: "AMD Radeon Graphics" for STX Halo iGPU on Ubuntu 24.04
+ if len(rocm_devices) == 0:
+ rocm_devices = [hip_devices[-1]]
+ logging.warning(
+ "No ROCm devices found when identifying HIP ID. "
+ f"Falling back to the following device: {rocm_devices[0]}"
+ )
+ elif len(rocm_devices) > 1:
+ logging.warning(
+ f"Multiple ROCm devices found when identifying HIP ID: {rocm_devices}"
+ "The last device will be used."
+ )
+
+ # Select the last device
+ device_selected = rocm_devices[-1]
+ logging.debug(f"Selected ROCm device: {device_selected}")
+
+ # Return the device ID
+ return device_selected[0]


  def get_llama_version(backend: str) -> str:
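
The refactor splits the old identify_rocm_arch_and_hip_id() in two: identify_rocm_arch() still maps GPU names to a ROCm build target, while identify_hip_id() asks the HIP runtime which devices actually exist and picks one. A hedged sketch of the selection rule, with the helper below invented for illustration:

```python
# Sketch only: reproduces the "prefer the last supported device, otherwise
# fall back to the last HIP device" rule used by identify_hip_id().
def pick_hip_device(hip_devices, is_supported):
    """hip_devices is a list of [device_id, device_name] pairs."""
    if not hip_devices:
        raise ValueError("No HIP devices found")
    supported = [d for d in hip_devices if is_supported(d[1])]
    # Generic names (e.g. "AMD Radeon Graphics") may not match any known
    # architecture, in which case the last reported device is used anyway.
    chosen = (supported or hip_devices)[-1]
    return chosen[0]

# Example: iGPU at index 0, dGPU at index 1 -> index 1 is selected
devices = [[0, "AMD Radeon 780M Graphics"], [1, "AMD Radeon RX 7900 XTX"]]
print(pick_hip_device(devices, lambda name: "rx 7900" in name.lower()))  # -> 1
```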
@@ -277,7 +310,7 @@ def install_llamacpp(backend):
  target_arch = None
  if backend == "rocm":
  # Identify the target architecture
- target_arch, hip_id = identify_rocm_arch_and_hip_id()
+ target_arch = identify_rocm_arch()
  if not target_arch:
  system = platform.system().lower()
  if system == "linux":
@@ -293,10 +326,6 @@ def install_llamacpp(backend):
  f"for supported configurations. {hint}"
  )

- # Set HIP_VISIBLE_DEVICES=0 for igpu, =1 for dgpu
- env_file_path = os.path.join(llama_server_exe_dir, ".env")
- set_key(env_file_path, "HIP_VISIBLE_DEVICES", hip_id)
-
  # Direct download for Vulkan/ROCm
  llama_archive_url, filename = get_binary_url_and_filename(backend, target_arch)
  llama_archive_path = os.path.join(llama_server_exe_dir, filename)
@@ -315,6 +344,12 @@ def install_llamacpp(backend):
  else:
  raise NotImplementedError(f"Unsupported archive format: {filename}")

+ # Identify and set HIP ID
+ if backend == "rocm":
+ hip_id = identify_hip_id()
+ env_file_path = os.path.join(llama_server_exe_dir, ".env")
+ set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))
+
  # Make executable on Linux - need to update paths after extraction
  if platform.system().lower() == "linux":
  # Re-get the paths since extraction might have changed the directory structure
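
The HIP_VISIBLE_DEVICES pin that the previous hunk deleted comes back here, but only after the ROCm archive is extracted, because identify_hip_id() needs the HIP runtime that ships inside it. A hedged sketch of the .env handoff, assuming python-dotenv is the library behind set_key (paths and the device id are placeholders):

```python
# Illustration only: pin a GPU for llama-server via a .env file.
import os
import tempfile
from dotenv import set_key, dotenv_values

install_dir = tempfile.mkdtemp()               # stand-in for the llama-server dir
env_file = os.path.join(install_dir, ".env")
open(env_file, "a").close()                    # ensure the file exists

set_key(env_file, "HIP_VISIBLE_DEVICES", "1")  # "1" = the device chosen above

# A launcher can later merge these values into the subprocess environment
env = {**os.environ, **dotenv_values(env_file)}
print(env["HIP_VISIBLE_DEVICES"])              # -> "1"
```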
@@ -778,3 +813,68 @@ class LlamaCppAdapter(ModelAdapter):
  error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
  error_msg += f"Command: {' '.join(cmd)}"
  raise Exception(error_msg)
+
+
+ def get_hip_devices():
+ """Get list of HIP devices with their IDs and names."""
+ import ctypes
+ import sys
+ import os
+ import glob
+ from ctypes import c_int, POINTER
+ from ctypes.util import find_library
+
+ # Get llama.cpp path
+ rocm_path = get_llama_folder_path("rocm")
+
+ # Load HIP library
+ hip_library_pattern = (
+ "amdhip64*.dll" if sys.platform.startswith("win") else "libamdhip64*.so"
+ )
+ search_pattern = os.path.join(rocm_path, hip_library_pattern)
+ matching_files = glob.glob(search_pattern)
+ if not matching_files:
+ raise RuntimeError(
+ f"Could not find HIP runtime library matching pattern: {search_pattern}"
+ )
+ try:
+ libhip = ctypes.CDLL(matching_files[0])
+ except OSError:
+ raise RuntimeError(f"Could not load HIP runtime library from {path}")
+
+ # Setup function signatures
+ hipError_t = c_int
+ hipDeviceProp_t = ctypes.c_char * 2048
+ libhip.hipGetDeviceCount.restype = hipError_t
+ libhip.hipGetDeviceCount.argtypes = [POINTER(c_int)]
+ libhip.hipGetDeviceProperties.restype = hipError_t
+ libhip.hipGetDeviceProperties.argtypes = [POINTER(hipDeviceProp_t), c_int]
+ libhip.hipGetErrorString.restype = ctypes.c_char_p
+ libhip.hipGetErrorString.argtypes = [hipError_t]
+
+ # Get device count
+ device_count = c_int()
+ err = libhip.hipGetDeviceCount(ctypes.byref(device_count))
+ if err != 0:
+ logging.error(
+ "hipGetDeviceCount failed:", libhip.hipGetErrorString(err).decode()
+ )
+ return []
+
+ # Get device properties
+ devices = []
+ for i in range(device_count.value):
+ prop = hipDeviceProp_t()
+ err = libhip.hipGetDeviceProperties(ctypes.byref(prop), i)
+ if err != 0:
+ logging.error(
+ f"hipGetDeviceProperties failed for device {i}:",
+ libhip.hipGetErrorString(err).decode(),
+ )
+ continue
+
+ # Extract device name from HIP device properties
+ device_name = ctypes.string_at(prop, 256).decode("utf-8").rstrip("\x00")
+ devices.append([i, device_name])
+
+ return devices
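
get_hip_devices() binds directly to the HIP runtime shipped with the ROCm llama.cpp build (amdhip64*.dll / libamdhip64*.so). For comparison, a system-wide ROCm install can be probed through ctypes.util.find_library, which the code above imports but does not use; a hedged sketch:

```python
# Hypothetical alternative: probe a system-installed HIP runtime instead of the
# bundled one. Requires ROCm to be installed; otherwise this raises RuntimeError.
import ctypes
from ctypes import POINTER, byref, c_int
from ctypes.util import find_library

libname = find_library("amdhip64")            # e.g. libamdhip64.so from /opt/rocm
if libname is None:
    raise RuntimeError("No system HIP runtime found")

libhip = ctypes.CDLL(libname)
libhip.hipGetDeviceCount.restype = c_int
libhip.hipGetDeviceCount.argtypes = [POINTER(c_int)]

count = c_int()
if libhip.hipGetDeviceCount(byref(count)) == 0:   # 0 == hipSuccess
    print(f"HIP reports {count.value} device(s)")
```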
@@ -109,7 +109,7 @@ class Cache(ManagementTool):
  # pylint: disable=pointless-statement,f-string-without-interpolation
  f"""
  A set of functions for managing the lemonade build cache. The default
- cache location is {lemonade_cache.DEFAULT_CACHE_DIR}, and can also be
+ cache location is {lemonade_cache.DEFAULT_CACHE_DIR}, and can also be
  selected with
  the global --cache-dir option or the LEMONADE_CACHE_DIR environment variable.

@@ -100,9 +100,10 @@ class OrtGenaiModel(ModelAdapter):
  max_new_tokens=512,
  min_new_tokens=0,
  do_sample=True,
- top_k=50,
- top_p=1.0,
- temperature=0.7,
+ top_k=None,
+ top_p=None,
+ temperature=None,
+ repeat_penalty=None,
  streamer: OrtGenaiStreamer = None,
  pad_token_id=None,
  stopping_criteria=None,
@@ -154,38 +155,58 @@ class OrtGenaiModel(ModelAdapter):
  if random_seed is None:
  random_seed = -1 # In og.Generator, -1 = seed with random device

+ # Get search config if available, otherwise use empty dict
+ # Thanks to the empty dict, if the model doesn't have a built-in search
+ # config, the .get() calls will all just use the default values
+ search_config = {}
  if self.config and "search" in self.config:
  search_config = self.config["search"]
- params.set_search_options(
- do_sample=search_config.get("do_sample", do_sample),
- top_k=search_config.get("top_k", top_k),
- top_p=search_config.get("top_p", top_p),
- temperature=search_config.get("temperature", temperature),
- max_length=max_length_to_use,
- min_length=min_length,
- early_stopping=search_config.get("early_stopping", False),
- length_penalty=search_config.get("length_penalty", 1.0),
- num_beams=search_config.get("num_beams", 1),
- num_return_sequences=search_config.get("num_return_sequences", 1),
- repetition_penalty=search_config.get("repetition_penalty", 1.0),
- past_present_share_buffer=search_config.get(
- "past_present_share_buffer", True
- ),
- random_seed=random_seed,
- # Not currently supported by OGA
- # diversity_penalty=search_config.get('diversity_penalty', 0.0),
- # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
- )
- else:
- params.set_search_options(
- do_sample=do_sample,
- top_k=top_k,
- top_p=top_p,
- temperature=temperature,
- max_length=max_length_to_use,
- min_length=min_length,
- random_seed=random_seed,
- )
+
+ # Apply parameter hierarchy: user provided > search config > defaults
+ default_top_k = 50
+ default_top_p = 1.0
+ default_temperature = 0.7
+ default_repetition_penalty = 1.0
+
+ top_k_to_use = (
+ top_k if top_k is not None else search_config.get("top_k", default_top_k)
+ )
+ top_p_to_use = (
+ top_p if top_p is not None else search_config.get("top_p", default_top_p)
+ )
+ temperature_to_use = (
+ temperature
+ if temperature is not None
+ else search_config.get("temperature", default_temperature)
+ )
+ # Map the llamacpp name, `repeat_penalty`, to the OGA name, `repetition_penalty`
+ repetition_penalty_to_use = (
+ repeat_penalty
+ if repeat_penalty is not None
+ else search_config.get("repetition_penalty", default_repetition_penalty)
+ )
+
+ # Set search options once with all parameters
+ params.set_search_options(
+ do_sample=search_config.get("do_sample", do_sample),
+ top_k=top_k_to_use,
+ top_p=top_p_to_use,
+ temperature=temperature_to_use,
+ repetition_penalty=repetition_penalty_to_use,
+ max_length=max_length_to_use,
+ min_length=min_length,
+ early_stopping=search_config.get("early_stopping", False),
+ length_penalty=search_config.get("length_penalty", 1.0),
+ num_beams=search_config.get("num_beams", 1),
+ num_return_sequences=search_config.get("num_return_sequences", 1),
+ past_present_share_buffer=search_config.get(
+ "past_present_share_buffer", True
+ ),
+ random_seed=random_seed,
+ # Not currently supported by OGA
+ # diversity_penalty=search_config.get('diversity_penalty', 0.0),
+ # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
+ )
  params.try_graph_capture_with_max_batch_size(1)

  generator = og.Generator(self.model, params)
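
Switching the generate() defaults to None lets the adapter distinguish "caller did not set this" from "caller asked for the old default", so values now resolve as: explicit argument, then the model's built-in search config, then a hard-coded fallback. A small sketch of that order, with the helper name invented for illustration:

```python
# Hypothetical helper mirroring the "argument > search config > default" rule.
def resolve(user_value, search_config, key, default):
    if user_value is not None:
        return user_value
    return search_config.get(key, default)

search_config = {"temperature": 0.6}                     # e.g. from the model
print(resolve(None, search_config, "temperature", 0.7))  # 0.6 (model config)
print(resolve(0.2, search_config, "temperature", 0.7))   # 0.2 (caller wins)
print(resolve(None, {}, "temperature", 0.7))             # 0.7 (fallback)
```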
@@ -43,6 +43,72 @@ def llamacpp_address(port: int) -> str:
  return f"http://127.0.0.1:{port}/v1"


+ def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
+ """
+ Separate standard OpenAI parameters from custom llama.cpp parameters.
+
+ Args:
+ request_dict: Dictionary of all request parameters
+ endpoint_type: Type of endpoint ("chat" or "completion")
+
+ Returns:
+ Dictionary with parameters properly separated for OpenAI client
+ """
+ openai_client_params = {}
+ extra_params = {}
+
+ # Common OpenAI parameters for both endpoint types
+ common_params = {
+ "model",
+ "frequency_penalty",
+ "logit_bias",
+ "logprobs",
+ "max_tokens",
+ "n",
+ "presence_penalty",
+ "seed",
+ "stop",
+ "stream",
+ "temperature",
+ "top_p",
+ "user",
+ }
+
+ # Standard OpenAI parameters by endpoint type
+ if endpoint_type == "chat":
+ chat_specific_params = {
+ "messages",
+ "top_logprobs",
+ "response_format",
+ "service_tier",
+ "stream_options",
+ "tools",
+ "tool_choice",
+ "parallel_tool_calls",
+ }
+ openai_params = common_params | chat_specific_params
+ else: # completion
+ completion_specific_params = {
+ "prompt",
+ "best_of",
+ "echo",
+ "suffix",
+ }
+ openai_params = common_params | completion_specific_params
+
+ for key, value in request_dict.items():
+ if key in openai_params:
+ openai_client_params[key] = value
+ else:
+ extra_params[key] = value
+
+ # If there are custom parameters, use extra_body to pass them through
+ if extra_params:
+ openai_client_params["extra_body"] = extra_params
+
+ return openai_client_params
+
+
  class LlamaTelemetry:
  """
  Manages telemetry data collection and display for llama server.
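
The split matters because the openai Python client rejects unknown keyword arguments, while anything placed in extra_body is merged into the request JSON as-is; that is how llama.cpp-only knobs such as top_k or repeat_penalty reach llama-server. A hedged usage sketch (base URL and model name are placeholders):

```python
# Illustration of extra_body pass-through with the openai client.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="none")  # placeholder

response = client.chat.completions.create(
    model="some-gguf-model",                           # placeholder model name
    messages=[{"role": "user", "content": "hi"}],
    temperature=0.7,                                   # standard OpenAI parameter
    extra_body={"top_k": 40, "repeat_penalty": 1.1},   # llama.cpp-specific knobs
)
print(response.choices[0].message.content)
```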
@@ -226,6 +292,11 @@ def _launch_llama_subprocess(
  "--ctx-size",
  str(ctx_size),
  ]
+
+ # Lock random seed for deterministic behavior in CI
+ if os.environ.get("LEMONADE_CI_MODE"):
+ base_command.extend(["--seed", "42"])
+
  if "mmproj" in snapshot_files:
  base_command.extend(["--mmproj", snapshot_files["mmproj"]])
  if not use_gpu:
@@ -238,6 +309,15 @@ def _launch_llama_subprocess(
  # Add port and jinja to enable tool use
  base_command.extend(["--port", str(telemetry.port), "--jinja"])

+ # Disable jinja for gpt-oss-120b on Vulkan
+ if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
+ base_command.remove("--jinja")
+ logging.warning(
+ "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+ "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+ "The model cannot use tools. If needed, use the ROCm backend instead."
+ )
+
  # Use legacy reasoning formatting, since not all apps support the new
  # reasoning_content field
  base_command.extend(["--reasoning-format", "none"])
@@ -384,13 +464,17 @@ def chat_completion(
  exclude_unset=True, exclude_none=True
  )

+ # Separate standard OpenAI parameters from custom llama.cpp parameters
+ openai_client_params = _separate_openai_params(request_dict, "chat")
+
  # Check if streaming is requested
  if chat_completion_request.stream:

  def event_stream():
  try:
  # Enable streaming
- for chunk in client.chat.completions.create(**request_dict):
+ # pylint: disable=missing-kwoa
+ for chunk in client.chat.completions.create(**openai_client_params):
  yield f"data: {chunk.model_dump_json()}\n\n"
  yield "data: [DONE]\n\n"

@@ -412,7 +496,8 @@ def chat_completion(
  # Non-streaming response
  try:
  # Disable streaming for non-streaming requests
- response = client.chat.completions.create(**request_dict)
+ # pylint: disable=missing-kwoa
+ response = client.chat.completions.create(**openai_client_params)

  # Show telemetry after completion
  telemetry.show_telemetry()
@@ -420,6 +505,7 @@ def chat_completion(
  return response

  except Exception as e: # pylint: disable=broad-exception-caught
+ logging.error("Error during chat completion: %s", str(e))
  raise HTTPException(
  status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  detail=f"Chat completion error: {str(e)}",
@@ -446,13 +532,17 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
  # Convert Pydantic model to dict and remove unset/null values
  request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)

+ # Separate standard OpenAI parameters from custom llama.cpp parameters
+ openai_client_params = _separate_openai_params(request_dict, "completion")
+
  # Check if streaming is requested
  if completion_request.stream:

  def event_stream():
  try:
  # Enable streaming
- for chunk in client.completions.create(**request_dict):
+ # pylint: disable=missing-kwoa
+ for chunk in client.completions.create(**openai_client_params):
  yield f"data: {chunk.model_dump_json()}\n\n"
  yield "data: [DONE]\n\n"

@@ -474,7 +564,8 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
  # Non-streaming response
  try:
  # Disable streaming for non-streaming requests
- response = client.completions.create(**request_dict)
+ # pylint: disable=missing-kwoa
+ response = client.completions.create(**openai_client_params)

  # Show telemetry after completion
  telemetry.show_telemetry()
@@ -482,6 +573,7 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
  return response

  except Exception as e: # pylint: disable=broad-exception-caught
+ logging.error("Error during completion: %s", str(e))
  raise HTTPException(
  status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  detail=f"Completion error: {str(e)}",
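
Both streaming branches above re-emit each chunk with server-sent-events framing (`data: <json>`, blank line, terminated by `data: [DONE]`). A hedged client-side sketch of consuming that stream; the URL, payload, and chunk field names are assumptions based on the OpenAI-compatible schema:

```python
# Illustration only: read the SSE stream produced by the endpoints above.
import json
import requests

payload = {
    "model": "some-model",                                  # placeholder
    "messages": [{"role": "user", "content": "hi"}],
    "stream": True,
}
with requests.post("http://localhost:8000/api/v1/chat/completions",  # assumed path
                   json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        print(chunk["choices"][0]["delta"].get("content", ""), end="")
```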
@@ -54,7 +54,11 @@ from lemonade.tools.server.utils.port import lifespan

  from lemonade_server.model_manager import ModelManager
  from lemonade_server.pydantic_models import (
- DEFAULT_MAX_NEW_TOKENS,
+ DEFAULT_PORT,
+ DEFAULT_HOST,
+ DEFAULT_LOG_LEVEL,
+ DEFAULT_LLAMACPP_BACKEND,
+ DEFAULT_CTX_SIZE,
  LoadConfig,
  CompletionRequest,
  ChatCompletionRequest,
@@ -65,18 +69,16 @@ from lemonade_server.pydantic_models import (
  DeleteConfig,
  )

+ # Set to a high number to allow for interesting experiences in real apps
+ # Tests should use the max_new_tokens argument to set a lower value
+ DEFAULT_MAX_NEW_TOKENS = 1500
+
  # Only import tray on Windows
  if platform.system() == "Windows":
  # pylint: disable=ungrouped-imports
  from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


- DEFAULT_PORT = 8000
- DEFAULT_LOG_LEVEL = "info"
- DEFAULT_LLAMACPP_BACKEND = "vulkan"
- DEFAULT_CTX_SIZE = 4096
-
-
  class ServerModel(Model):
  """
  An extension of OpenAI's Model class that adds
@@ -150,6 +152,7 @@ class Server:
  def __init__(
  self,
  port: int = DEFAULT_PORT,
+ host: str = DEFAULT_HOST,
  log_level: str = DEFAULT_LOG_LEVEL,
  ctx_size: int = DEFAULT_CTX_SIZE,
  tray: bool = False,
@@ -160,6 +163,7 @@ class Server:

  # Save args as members
  self.port = port
+ self.host = host
  self.log_level = log_level
  self.ctx_size = ctx_size
  self.tray = tray
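
With DEFAULT_HOST imported from pydantic_models and stored on the server alongside the port, the bind address becomes configurable rather than hard-coded to localhost (the uvicorn.run change further below uses it). A hedged sketch of what the call amounts to; the app and values here are placeholders:

```python
# Illustration only: bind a FastAPI app to a configurable host and port.
import uvicorn
from fastapi import FastAPI

app = FastAPI()

if __name__ == "__main__":
    host = "0.0.0.0"   # e.g. expose on the LAN; "localhost" keeps it local-only
    port = 8000
    uvicorn.run(app, host=host, port=port, log_level="info")
```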
@@ -255,6 +259,47 @@ class Server:
  self.app.post(f"{prefix}/reranking")(self.reranking)
  self.app.post(f"{prefix}/rerank")(self.reranking)

+ def _log_request_parameters(self, request, endpoint_name: str):
+ """
+ Log request parameters excluding content fields like messages, prompt, or input.
+
+ Args:
+ request: Any request object (CompletionRequest, ChatCompletionRequest, etc.)
+ endpoint_name: Name of the endpoint for logging context
+ """
+ if not logging.getLogger().isEnabledFor(logging.DEBUG):
+ return
+
+ # Fields to exclude from logging (content fields)
+ excluded_fields = {"messages", "prompt", "input"}
+
+ # Get all attributes from the request object
+ request_params = {}
+ if hasattr(request, "__dict__"):
+ # For pydantic models, get the dict representation
+ if hasattr(request, "model_dump"):
+ all_params = request.model_dump()
+ elif hasattr(request, "dict"):
+ all_params = request.dict()
+ else:
+ all_params = request.__dict__
+
+ # Filter out excluded fields and add special handling for certain fields
+ for key, value in all_params.items():
+ if key not in excluded_fields:
+ # Special handling for tools field - show count instead of full content
+ if key == "tools" and value is not None:
+ request_params[key] = (
+ f"{len(value)} tools" if isinstance(value, list) else value
+ )
+ # Special handling for input type in responses
+ elif key == "input" and hasattr(request, "input"):
+ request_params["input_type"] = type(value).__name__
+ else:
+ request_params[key] = value
+
+ logging.debug(f"{endpoint_name} request parameters: {request_params}")
+
  def _setup_server_common(
  self,
  tray: bool = False,
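
The helper only does work when the root logger is at DEBUG level, and it drops content-bearing fields so prompts and chat history never end up in the log. A minimal sketch of the same filtering idea against a plain pydantic model (the model and its fields are invented):

```python
# Hypothetical illustration of logging request metadata without content fields.
import logging
from pydantic import BaseModel

class FakeChatRequest(BaseModel):   # invented stand-in for the real request model
    model: str
    messages: list
    temperature: float | None = None

logging.basicConfig(level=logging.DEBUG)

req = FakeChatRequest(
    model="m",
    messages=[{"role": "user", "content": "secret prompt"}],
    temperature=0.2,
)
safe = {k: v for k, v in req.model_dump().items()
        if k not in {"messages", "prompt", "input"}}
logging.debug("Chat completions request parameters: %s", safe)
```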
@@ -332,6 +377,9 @@ class Server:
  # Let the app know what port it's running on, so
  # that the lifespan can access it
  self.app.port = self.port
+ # FastAPI already has a `host` function and we cannot use `_host` as
+ # PyLint will believe its private
+ self.app.host_ = self.host

  def run(self):
  # Common setup
@@ -340,9 +388,7 @@ class Server:
  tray=self.tray,
  )

- uvicorn.run(
- self.app, host="localhost", port=self.port, log_level=self.log_level
- )
+ uvicorn.run(self.app, host=self.host, port=self.port, log_level=self.log_level)

  def run_in_thread(self, host: str = "localhost"):
  """
@@ -431,6 +477,9 @@ class Server:

  lc = self.initialize_load_config(completion_request)

+ # Log request parameters (excluding message content for brevity)
+ self._log_request_parameters(completion_request, "Completions")
+
  # Load the model if it's different from the currently loaded one
  await self.load_llm(lc)

@@ -452,6 +501,9 @@ class Server:
  "message": text,
  "stop": completion_request.stop,
  "temperature": completion_request.temperature,
+ "repeat_penalty": completion_request.repeat_penalty,
+ "top_k": completion_request.top_k,
+ "top_p": completion_request.top_p,
  "max_new_tokens": completion_request.max_tokens,
  }

@@ -560,6 +612,9 @@ class Server:

  lc = self.initialize_load_config(chat_completion_request)

+ # Log request parameters (excluding message history for brevity)
+ self._log_request_parameters(chat_completion_request, "Chat completions")
+
  # Load the model if it's different from the currently loaded one
  await self.load_llm(lc)

@@ -604,6 +659,9 @@ class Server:
  "message": text,
  "stop": chat_completion_request.stop,
  "temperature": chat_completion_request.temperature,
+ "repeat_penalty": chat_completion_request.repeat_penalty,
+ "top_k": chat_completion_request.top_k,
+ "top_p": chat_completion_request.top_p,
  "max_new_tokens": max_new_tokens,
  }

@@ -852,6 +910,9 @@ class Server:

  lc = self.initialize_load_config(responses_request)

+ # Log request parameters (excluding message history for brevity)
+ self._log_request_parameters(responses_request, "Responses")
+
  # Load the model if it's different from the currently loaded one
  await self.load_llm(lc)

@@ -873,6 +934,9 @@ class Server:
  generation_args = {
  "message": text,
  "temperature": responses_request.temperature,
+ "repeat_penalty": responses_request.repeat_penalty,
+ "top_k": responses_request.top_k,
+ "top_p": responses_request.top_p,
  "max_new_tokens": responses_request.max_output_tokens,
  }

@@ -1002,6 +1066,9 @@ class Server:
  stop: list[str] | str | None = None,
  max_new_tokens: int | None = None,
  temperature: float | None = None,
+ repeat_penalty: float | None = None,
+ top_k: int | None = None,
+ top_p: float | None = None,
  ):
  """
  Core streaming completion logic, separated from response handling.
@@ -1084,6 +1151,9 @@ class Server:
  "pad_token_id": tokenizer.eos_token_id,
  "stopping_criteria": stopping_criteria,
  "temperature": temperature,
+ "repeat_penalty": repeat_penalty,
+ "top_k": top_k,
+ "top_p": top_p,
  }

  # Initialize performance variables