lemonade-sdk 8.1.4__py3-none-any.whl → 8.1.6__py3-none-any.whl

This diff compares publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic.

@@ -1,144 +1,28 @@
  import os
  import logging
- import time
  import subprocess
  import re
  import threading
  import platform

- import requests
- from tabulate import tabulate
  from dotenv import load_dotenv
- from fastapi import HTTPException, status
- from fastapi.responses import StreamingResponse
-
- from openai import OpenAI

  from lemonade_server.pydantic_models import (
- ChatCompletionRequest,
- CompletionRequest,
  PullConfig,
- EmbeddingsRequest,
- RerankingRequest,
  )
- from lemonade_server.model_manager import ModelManager
- from lemonade.tools.server.utils.port import find_free_port
  from lemonade.tools.llamacpp.utils import (
  get_llama_server_exe_path,
  install_llamacpp,
  download_gguf,
  )
+ from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer


- def llamacpp_address(port: int) -> str:
- """
- Generate the base URL for the llamacpp server.
-
- Args:
- port: The port number the llamacpp server is running on
-
- Returns:
- The base URL for the llamacpp server
- """
- return f"http://127.0.0.1:{port}/v1"
-
-
- def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
- """
- Separate standard OpenAI parameters from custom llama.cpp parameters.
-
- Args:
- request_dict: Dictionary of all request parameters
- endpoint_type: Type of endpoint ("chat" or "completion")
-
- Returns:
- Dictionary with parameters properly separated for OpenAI client
- """
- openai_client_params = {}
- extra_params = {}
-
- # Common OpenAI parameters for both endpoint types
- common_params = {
- "model",
- "frequency_penalty",
- "logit_bias",
- "logprobs",
- "max_tokens",
- "n",
- "presence_penalty",
- "seed",
- "stop",
- "stream",
- "temperature",
- "top_p",
- "user",
- }
-
- # Standard OpenAI parameters by endpoint type
- if endpoint_type == "chat":
- chat_specific_params = {
- "messages",
- "top_logprobs",
- "response_format",
- "service_tier",
- "stream_options",
- "tools",
- "tool_choice",
- "parallel_tool_calls",
- }
- openai_params = common_params | chat_specific_params
- else: # completion
- completion_specific_params = {
- "prompt",
- "best_of",
- "echo",
- "suffix",
- }
- openai_params = common_params | completion_specific_params
-
- for key, value in request_dict.items():
- if key in openai_params:
- openai_client_params[key] = value
- else:
- extra_params[key] = value
-
- # If there are custom parameters, use extra_body to pass them through
- if extra_params:
- openai_client_params["extra_body"] = extra_params
-
- return openai_client_params
-
-
- class LlamaTelemetry:
+ class LlamaTelemetry(WrappedServerTelemetry):
  """
  Manages telemetry data collection and display for llama server.
  """

- def __init__(self):
- self.input_tokens = None
- self.output_tokens = None
- self.time_to_first_token = None
- self.tokens_per_second = None
- self.prompt_eval_time = None
- self.eval_time = None
- self.port = None
-
- def choose_port(self):
- """
- Users probably don't care what port we start llama-server on, so let's
- search for an empty port
- """
-
- self.port = find_free_port()
-
- if self.port is None:
- msg = "Failed to find an empty port to start llama-server on"
- logging.error(msg)
- raise HTTPException(
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
- detail=msg,
- )
-
  def parse_telemetry_line(self, line: str):
  """
  Parse telemetry data from llama server output lines.
@@ -186,468 +70,186 @@ class LlamaTelemetry:
  self.tokens_per_second = tokens_per_second
  return

- def get_telemetry_data(self):
- return {
- "input_tokens": self.input_tokens,
- "output_tokens": self.output_tokens,
- "time_to_first_token": self.time_to_first_token,
- "tokens_per_second": self.tokens_per_second,
- "decode_token_times": None,
- }
-
- def show_telemetry(self):
- # Check if debug logging is enabled
- if not logging.getLogger().isEnabledFor(logging.DEBUG):
- return
-
- # Prepare telemetry data (transposed format)
- telemetry = [
- ["Input tokens", self.input_tokens],
- ["Output tokens", self.output_tokens],
- ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
- ["TPS", f"{self.tokens_per_second:.2f}"],
- ]

- table = tabulate(
- telemetry, headers=["Metric", "Value"], tablefmt="fancy_grid"
- ).split("\n")
+ class LlamaServer(WrappedServer):
+ def __init__(self, backend: str):
+ self.telemetry = LlamaTelemetry()
+ self.backend = backend
+ super().__init__(server_name="llama-server", telemetry=self.telemetry)

- # Show telemetry in debug while complying with uvicorn's log indentation
- logging.debug("\n ".join(table))
+ def install_server(self, backend=None):
+ """
+ Install the wrapped server
+ """
+ install_llamacpp(self.backend)

+ def download_model(
+ self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+ ) -> dict:
+ """
+ Download a model for the wrapper server
+ """
+ return download_gguf(
+ config_checkpoint=config_checkpoint,
+ config_mmproj=config_mmproj,
+ do_not_upgrade=do_not_upgrade,
+ )

- def _log_subprocess_output(
- process: subprocess.Popen, prefix: str, telemetry: LlamaTelemetry
- ):
- """
- Read subprocess output line by line, log to debug, and parse telemetry
- """
+ def _launch_device_backend_subprocess(
+ self,
+ snapshot_files: dict,
+ use_gpu: bool,
+ ctx_size: int,
+ supports_embeddings: bool = False,
+ supports_reranking: bool = False,
+ ) -> subprocess.Popen:
+ """
+ Launch llama server subprocess with appropriate configuration.
+
+ Args:
+ snapshot_files: Dictionary of model files to load
+ use_gpu: Whether to use GPU acceleration
+ telemetry: Telemetry object for tracking performance metrics
+ backend: Backend to use (e.g., 'vulkan', 'rocm')
+ supports_embeddings: Whether the model supports embeddings
+ supports_reranking: Whether the model supports reranking
+
+ Returns:
+ Subprocess handle for the llama server
+ """

- if process.stdout:
- try:
- for line in iter(process.stdout.readline, ""):
- if line:
- line_stripped = line.strip()
- logging.debug("%s: %s", prefix, line_stripped)
-
- telemetry.parse_telemetry_line(line_stripped)
-
- if process.poll() is not None:
- break
- except UnicodeDecodeError as e:
- logging.debug("Unicode decode error reading subprocess output: %s", str(e))
- except Exception as e: # pylint: disable=broad-exception-caught
- logging.error("Unexpected error reading subprocess output: %s", str(e))
-
-
- def _wait_for_load(llama_server_process: subprocess.Popen, port: int):
- status_code = None
- while not llama_server_process.poll() and status_code != 200:
- health_url = f"http://localhost:{port}/health"
- try:
- health_response = requests.get(health_url)
- except requests.exceptions.ConnectionError:
- logging.debug("Not able to connect to llama-server yet, will retry")
- else:
- status_code = health_response.status_code
- logging.debug(
- "Testing llama-server readiness (will retry until ready), "
- f"result: {health_response.json()}"
- )
- time.sleep(1)
-
-
- def _launch_llama_subprocess(
- snapshot_files: dict,
- use_gpu: bool,
- telemetry: LlamaTelemetry,
- backend: str,
- ctx_size: int,
- supports_embeddings: bool = False,
- supports_reranking: bool = False,
- ) -> subprocess.Popen:
- """
- Launch llama server subprocess with appropriate configuration.
-
- Args:
- snapshot_files: Dictionary of model files to load
- use_gpu: Whether to use GPU acceleration
- telemetry: Telemetry object for tracking performance metrics
- backend: Backend to use (e.g., 'vulkan', 'rocm')
- supports_embeddings: Whether the model supports embeddings
- supports_reranking: Whether the model supports reranking
-
- Returns:
- Subprocess handle for the llama server
- """
+ # Get the current executable path (handles both Windows and Ubuntu structures)
+ exe_path = get_llama_server_exe_path(self.backend)

- # Get the current executable path (handles both Windows and Ubuntu structures)
- exe_path = get_llama_server_exe_path(backend)
-
- # Build the base command
- base_command = [
- exe_path,
- "-m",
- snapshot_files["variant"],
- "--ctx-size",
- str(ctx_size),
- ]
-
- # Lock random seed for deterministic behavior in CI
- if os.environ.get("LEMONADE_CI_MODE"):
- base_command.extend(["--seed", "42"])
-
- if "mmproj" in snapshot_files:
- base_command.extend(["--mmproj", snapshot_files["mmproj"]])
- if not use_gpu:
- base_command.extend(["--no-mmproj-offload"])
-
- # Find a port, and save it in the telemetry object for future reference
- # by other functions
- telemetry.choose_port()
-
- # Add port and jinja to enable tool use
- base_command.extend(["--port", str(telemetry.port), "--jinja"])
-
- # Disable jinja for gpt-oss-120b on Vulkan
- if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
- base_command.remove("--jinja")
- logging.warning(
- "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
- "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
- "The model cannot use tools. If needed, use the ROCm backend instead."
- )
+ # Build the base command
+ base_command = [
+ exe_path,
+ "-m",
+ snapshot_files["variant"],
+ "--ctx-size",
+ str(ctx_size),
+ ]

- # Use legacy reasoning formatting, since not all apps support the new
- # reasoning_content field
- base_command.extend(["--reasoning-format", "none"])
-
- # Add embeddings support if the model supports it
- if supports_embeddings:
- base_command.append("--embeddings")
-
- # Add reranking support if the model supports it
- if supports_reranking:
- base_command.append("--reranking")
-
- # Configure GPU layers: 99 for GPU, 0 for CPU-only
- ngl_value = "99" if use_gpu else "0"
- command = base_command + ["-ngl", ngl_value]
-
- # Set up environment with library path for Linux
- env = os.environ.copy()
-
- # Load environment variables from .env file in the executable directory
- exe_dir = os.path.dirname(exe_path)
- env_file_path = os.path.join(exe_dir, ".env")
- if os.path.exists(env_file_path):
- load_dotenv(env_file_path, override=True)
- env.update(os.environ)
- logging.debug(f"Loaded environment variables from {env_file_path}")
-
- if platform.system().lower() == "linux":
- lib_dir = os.path.dirname(exe_path) # Same directory as the executable
- current_ld_path = env.get("LD_LIBRARY_PATH", "")
- if current_ld_path:
- env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
- else:
- env["LD_LIBRARY_PATH"] = lib_dir
- logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
-
- # Start subprocess with output capture
- process = subprocess.Popen(
- command,
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- text=True,
- encoding="utf-8",
- errors="replace",
- bufsize=1,
- env=env,
- )
-
- # Start background thread to log subprocess output
- device_type = "GPU" if use_gpu else "CPU"
- threading.Thread(
- target=_log_subprocess_output,
- args=(process, f"LLAMA SERVER {device_type}", telemetry),
- daemon=True,
- ).start()
-
- return process
-
-
- def server_load(
- model_config: PullConfig,
- telemetry: LlamaTelemetry,
- backend: str,
- ctx_size: int,
- do_not_upgrade: bool = False,
- ):
- # Install and/or update llama.cpp if needed
- try:
- install_llamacpp(backend)
- except NotImplementedError as e:
- raise HTTPException(
- status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
- )
+ # Lock random seed for deterministic behavior in CI
+ if os.environ.get("LEMONADE_CI_MODE"):
+ base_command.extend(["--seed", "42"])
+ logging.info(f"Seed applied to base command: {base_command}")
+
+ if "mmproj" in snapshot_files:
+ base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+ if not use_gpu:
+ base_command.extend(["--no-mmproj-offload"])
+
+ # Find a port, and save it in the telemetry object for future reference
+ # by other functions
+ self.choose_port()
+
+ # Add port and jinja to enable tool use
+ base_command.extend(["--port", str(self.port), "--jinja"])
+
+ # Disable jinja for gpt-oss-120b on Vulkan
+ if (
+ self.backend == "vulkan"
+ and "gpt-oss-120b" in snapshot_files["variant"].lower()
+ ):
+ base_command.remove("--jinja")
+ logging.warning(
+ "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+ "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+ "The model cannot use tools. If needed, use the ROCm backend instead."
+ )

- # Download the gguf to the hugging face cache
- snapshot_files = download_gguf(
- model_config.checkpoint, model_config.mmproj, do_not_upgrade=do_not_upgrade
- )
- logging.debug(f"GGUF file paths: {snapshot_files}")
-
- # Check if model supports embeddings
- supported_models = ModelManager().supported_models
- model_info = supported_models.get(model_config.model_name, {})
- supports_embeddings = "embeddings" in model_info.get("labels", [])
- supports_reranking = "reranking" in model_info.get("labels", [])
-
- # Attempt loading on GPU first
- llama_server_process = _launch_llama_subprocess(
- snapshot_files,
- use_gpu=True,
- telemetry=telemetry,
- backend=backend,
- ctx_size=ctx_size,
- supports_embeddings=supports_embeddings,
- supports_reranking=supports_reranking,
- )
-
- # Check the /health endpoint until GPU server is ready
- _wait_for_load(
- llama_server_process,
- telemetry.port,
- )
-
- # If loading on GPU failed, try loading on CPU
- if llama_server_process.poll():
- logging.warning(
- f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
+ # Use legacy reasoning formatting, since not all apps support the new
+ # reasoning_content field
+ base_command.extend(["--reasoning-format", "none"])
+
+ # Add embeddings support if the model supports it
+ if supports_embeddings:
+ base_command.append("--embeddings")
+
+ # Add reranking support if the model supports it
+ if supports_reranking:
+ base_command.append("--reranking")
+
+ # Configure GPU layers: 99 for GPU, 0 for CPU-only
+ ngl_value = "99" if use_gpu else "0"
+ command = base_command + ["-ngl", ngl_value]
+
+ # Set up environment with library path for Linux
+ env = os.environ.copy()
+
+ # Load environment variables from .env file in the executable directory
+ exe_dir = os.path.dirname(exe_path)
+ env_file_path = os.path.join(exe_dir, ".env")
+ if os.path.exists(env_file_path):
+ load_dotenv(env_file_path, override=True)
+ env.update(os.environ)
+ logging.debug(f"Loaded environment variables from {env_file_path}")
+
+ if platform.system().lower() == "linux":
+ lib_dir = os.path.dirname(exe_path) # Same directory as the executable
+ current_ld_path = env.get("LD_LIBRARY_PATH", "")
+ if current_ld_path:
+ env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+ else:
+ env["LD_LIBRARY_PATH"] = lib_dir
+ logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+
+ # Start subprocess with output capture
+ self.process = subprocess.Popen(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ encoding="utf-8",
+ errors="replace",
+ bufsize=1,
+ env=env,
  )

- if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
- # Used for testing, when the test should fail if GPU didn't work
- raise Exception("llamacpp GPU loading failed")
-
- llama_server_process = _launch_llama_subprocess(
+ # Start background thread to log subprocess output
+ device_type = "GPU" if use_gpu else "CPU"
+ threading.Thread(
+ target=self._log_subprocess_output,
+ args=(f"LLAMA SERVER {device_type}",),
+ daemon=True,
+ ).start()
+
+ def _launch_server_subprocess(
+ self,
+ model_config: PullConfig,
+ snapshot_files: dict,
+ ctx_size: int,
+ supports_embeddings: bool = False,
+ supports_reranking: bool = False,
+ ):
+
+ # Attempt loading on GPU first
+ self._launch_device_backend_subprocess(
  snapshot_files,
- use_gpu=False,
- telemetry=telemetry,
- backend=backend,
+ use_gpu=True,
  ctx_size=ctx_size,
  supports_embeddings=supports_embeddings,
  supports_reranking=supports_reranking,
  )

- # Check the /health endpoint until CPU server is ready
- _wait_for_load(
- llama_server_process,
- telemetry.port,
- )
-
- if llama_server_process.poll():
- raise HTTPException(
- status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
- detail=f"Failed to load {model_config.model_name} with llama.cpp",
- )
+ # Check the /health endpoint until GPU server is ready
+ self._wait_for_load()

- return llama_server_process
-
-
- def chat_completion(
- chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
- ):
- base_url = llamacpp_address(telemetry.port)
- client = OpenAI(
- base_url=base_url,
- api_key="lemonade",
- )
-
- # Convert Pydantic model to dict and remove unset/null values
- request_dict = chat_completion_request.model_dump(
- exclude_unset=True, exclude_none=True
- )
-
- # Separate standard OpenAI parameters from custom llama.cpp parameters
- openai_client_params = _separate_openai_params(request_dict, "chat")
-
- # Check if streaming is requested
- if chat_completion_request.stream:
-
- def event_stream():
- try:
- # Enable streaming
- # pylint: disable=missing-kwoa
- for chunk in client.chat.completions.create(**openai_client_params):
- yield f"data: {chunk.model_dump_json()}\n\n"
- yield "data: [DONE]\n\n"
-
- # Show telemetry after completion
- telemetry.show_telemetry()
-
- except Exception as e: # pylint: disable=broad-exception-caught
- yield f'data: {{"error": "{str(e)}"}}\n\n'
-
- return StreamingResponse(
- event_stream(),
- media_type="text/event-stream",
- headers={
- "Cache-Control": "no-cache",
- "Connection": "keep-alive",
- },
- )
- else:
- # Non-streaming response
- try:
- # Disable streaming for non-streaming requests
- # pylint: disable=missing-kwoa
- response = client.chat.completions.create(**openai_client_params)
-
- # Show telemetry after completion
- telemetry.show_telemetry()
-
- return response
-
- except Exception as e: # pylint: disable=broad-exception-caught
- logging.error("Error during chat completion: %s", str(e))
- raise HTTPException(
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
- detail=f"Chat completion error: {str(e)}",
+ # If loading on GPU failed, try loading on CPU
+ if self.process.poll():
+ logging.warning(
+ f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
  )

+ if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
+ # Used for testing, when the test should fail if GPU didn't work
+ raise Exception("llamacpp GPU loading failed")

- def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry):
- """
- Handle text completions using the llamacpp server.
-
- Args:
- completion_request: The completion request containing prompt and parameters
- telemetry: Telemetry object containing the server port
-
- Returns:
- Completion response from the llamacpp server
- """
- base_url = llamacpp_address(telemetry.port)
- client = OpenAI(
- base_url=base_url,
- api_key="lemonade",
- )
-
- # Convert Pydantic model to dict and remove unset/null values
- request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
-
- # Separate standard OpenAI parameters from custom llama.cpp parameters
- openai_client_params = _separate_openai_params(request_dict, "completion")
-
- # Check if streaming is requested
- if completion_request.stream:
-
- def event_stream():
- try:
- # Enable streaming
- # pylint: disable=missing-kwoa
- for chunk in client.completions.create(**openai_client_params):
- yield f"data: {chunk.model_dump_json()}\n\n"
- yield "data: [DONE]\n\n"
-
- # Show telemetry after completion
- telemetry.show_telemetry()
-
- except Exception as e: # pylint: disable=broad-exception-caught
- yield f'data: {{"error": "{str(e)}"}}\n\n'
-
- return StreamingResponse(
- event_stream(),
- media_type="text/event-stream",
- headers={
- "Cache-Control": "no-cache",
- "Connection": "keep-alive",
- },
- )
- else:
- # Non-streaming response
- try:
- # Disable streaming for non-streaming requests
- # pylint: disable=missing-kwoa
- response = client.completions.create(**openai_client_params)
-
- # Show telemetry after completion
- telemetry.show_telemetry()
-
- return response
-
- except Exception as e: # pylint: disable=broad-exception-caught
- logging.error("Error during completion: %s", str(e))
- raise HTTPException(
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
- detail=f"Completion error: {str(e)}",
+ self._launch_device_backend_subprocess(
+ snapshot_files,
+ use_gpu=False,
+ ctx_size=ctx_size,
+ supports_embeddings=supports_embeddings,
+ supports_reranking=supports_reranking,
  )
-
-
- def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
- """
- Generate embeddings using the llamacpp server.
-
- Args:
- embeddings_request: The embeddings request containing input text/tokens
- telemetry: Telemetry object containing the server port
-
- Returns:
- Embeddings response from the llamacpp server
- """
- base_url = llamacpp_address(telemetry.port)
- client = OpenAI(
- base_url=base_url,
- api_key="lemonade",
- )
-
- # Convert Pydantic model to dict and remove unset/null values
- request_dict = embeddings_request.model_dump(exclude_unset=True, exclude_none=True)
-
- try:
- # Call the embeddings endpoint
- response = client.embeddings.create(**request_dict)
- return response
-
- except Exception as e: # pylint: disable=broad-exception-caught
- raise HTTPException(
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
- detail=f"Embeddings error: {str(e)}",
- )
-
-
- def reranking(reranking_request: RerankingRequest, telemetry: LlamaTelemetry):
- """
- Rerank documents based on their relevance to a query using the llamacpp server.
-
- Args:
- reranking_request: The reranking request containing query and documents
- telemetry: Telemetry object containing the server port
-
- Returns:
- Reranking response from the llamacpp server containing ranked documents and scores
- """
- base_url = llamacpp_address(telemetry.port)
-
- try:
- # Convert Pydantic model to dict and exclude unset/null values
- request_dict = reranking_request.model_dump(
- exclude_unset=True, exclude_none=True
- )
-
- # Call the reranking endpoint directly since it's not supported by the OpenAI API
- response = requests.post(
- f"{base_url}/rerank",
- json=request_dict,
- )
- response.raise_for_status()
- return response.json()
-
- except Exception as e:
- logging.error("Error during reranking: %s", str(e))
- raise HTTPException(
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
- detail=f"Reranking error: {str(e)}",
- ) from e
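
For orientation, the sketch below shows how the refactored pieces appear to fit together, inferred only from the signatures visible in this diff: a LlamaServer is constructed per llama.cpp backend, installs llama.cpp, downloads a GGUF checkpoint, and delegates process launch, /health polling, and GPU-to-CPU fallback to the WrappedServer base class. The import path and checkpoint string are assumptions for illustration, not the package's documented API.

# Minimal usage sketch inferred from the diff above; not the package's documented API.
from lemonade.tools.server.llamacpp import LlamaServer  # module path assumed

# One LlamaServer per backend; "vulkan" and "rocm" are the backends named in the diff.
server = LlamaServer(backend="vulkan")

# Install or update llama.cpp for the chosen backend (wraps install_llamacpp).
server.install_server()

# Download the GGUF checkpoint (and optional mmproj) to the local cache (wraps download_gguf).
snapshot_files = server.download_model("example/model-GGUF")  # placeholder checkpoint id

# Process launch, health checking, and the GPU-then-CPU fallback are handled by the
# WrappedServer base class (not shown in this diff), which presumably invokes
# _launch_server_subprocess() with a PullConfig, the snapshot files, and a context size.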