lemonade-sdk 8.0.3__tar.gz → 8.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic. Click here for more details.

Files changed (77)
  1. {lemonade_sdk-8.0.3/src/lemonade_sdk.egg-info → lemonade_sdk-8.0.4}/PKG-INFO +5 -12
  2. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/README.md +4 -11
  3. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/humaneval.py +1 -1
  4. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/mmlu.py +1 -1
  5. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/load.py +1 -1
  6. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/perplexity.py +2 -2
  7. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/quark/quark_load.py +1 -1
  8. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/quark/quark_quantize.py +2 -2
  9. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/llamacpp.py +130 -9
  10. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/serve.py +73 -0
  11. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/static/styles.css +424 -4
  12. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/static/webapp.html +301 -35
  13. lemonade_sdk-8.0.4/src/lemonade/version.py +1 -0
  14. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4/src/lemonade_sdk.egg-info}/PKG-INFO +5 -12
  15. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_server/model_manager.py +12 -2
  16. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_server/pydantic_models.py +25 -1
  17. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_server/server_models.json +46 -44
  18. lemonade_sdk-8.0.3/src/lemonade/version.py +0 -1
  19. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/LICENSE +0 -0
  20. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/NOTICE.md +0 -0
  21. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/pyproject.toml +0 -0
  22. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/setup.cfg +0 -0
  23. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/setup.py +0 -0
  24. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/__init__.py +0 -0
  25. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/api.py +0 -0
  26. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/cache.py +0 -0
  27. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/cli.py +0 -0
  28. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/__init__.py +0 -0
  29. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/build.py +0 -0
  30. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/cli_helpers.py +0 -0
  31. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/exceptions.py +0 -0
  32. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/filesystem.py +0 -0
  33. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/network.py +0 -0
  34. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/printing.py +0 -0
  35. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/status.py +0 -0
  36. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/system_info.py +0 -0
  37. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/common/test_helpers.py +0 -0
  38. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/profilers/__init__.py +0 -0
  39. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/profilers/memory_tracker.py +0 -0
  40. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/profilers/profiler.py +0 -0
  41. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/sequence.py +0 -0
  42. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/state.py +0 -0
  43. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/__init__.py +0 -0
  44. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/accuracy.py +0 -0
  45. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/adapter.py +0 -0
  46. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/bench.py +0 -0
  47. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/huggingface/bench.py +0 -0
  48. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/huggingface/load.py +0 -0
  49. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/huggingface/utils.py +0 -0
  50. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/llamacpp/bench.py +0 -0
  51. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/llamacpp/load.py +0 -0
  52. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/management_tools.py +0 -0
  53. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/__init__.py +0 -0
  54. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/bench.py +0 -0
  55. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/utils.py +0 -0
  56. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/prompt.py +0 -0
  57. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/quark/__init__.py +0 -0
  58. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/report/__init__.py +0 -0
  59. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/report/llm_report.py +0 -0
  60. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/report/table.py +0 -0
  61. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/__init__.py +0 -0
  62. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/static/favicon.ico +0 -0
  63. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/tool_calls.py +0 -0
  64. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/tray.py +0 -0
  65. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/utils/port.py +0 -0
  66. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/utils/system_tray.py +0 -0
  67. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/utils/thread.py +0 -0
  68. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/webapp.py +0 -0
  69. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade/tools/tool.py +0 -0
  70. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_install/__init__.py +0 -0
  71. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_install/install.py +0 -0
  72. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/SOURCES.txt +0 -0
  73. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  74. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  75. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/requires.txt +0 -0
  76. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
  77. {lemonade_sdk-8.0.3 → lemonade_sdk-8.0.4}/src/lemonade_server/cli.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lemonade-sdk
3
- Version: 8.0.3
3
+ Version: 8.0.4
4
4
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
5
5
  Author-email: lemonade@amd.com
6
6
  Requires-Python: >=3.10, <3.12
@@ -82,7 +82,7 @@ Dynamic: summary
82
82
 
83
83
  [![Lemonade tests](https://github.com/lemonade-sdk/lemonade/actions/workflows/test_lemonade.yml/badge.svg)](https://github.com/lemonade-sdk/lemonade/tree/main/test "Check out our tests")
84
84
  [![OS - Windows | Linux](https://img.shields.io/badge/OS-windows%20%7C%20linux-blue)](docs/README.md#installation "Check out our instructions")
85
- [![Made with Python](https://img.shields.io/badge/Python-3.8,3.10-blue?logo=python&logoColor=white)](docs/README.md#installation "Check out our instructions")
85
+ [![Made with Python](https://img.shields.io/badge/Python-3.10-blue?logo=python&logoColor=white)](docs/README.md#installation "Check out our instructions")
86
86
 
87
87
  ## 🍋 Lemonade SDK: Quickly serve, benchmark and deploy LLMs
88
88
 
@@ -97,8 +97,8 @@ The [Lemonade SDK](./docs/README.md) makes it easy to run Large Language Models
97
97
  The [Lemonade SDK](./docs/README.md) is comprised of the following:
98
98
 
99
99
  - 🌐 **[Lemonade Server](https://lemonade-server.ai/docs)**: A local LLM server for running ONNX and GGUF models using the OpenAI API standard. Install and enable your applications with NPU and GPU acceleration in minutes.
100
- - 🐍 **Lemonade API**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
101
- - 🖥️ **Lemonade CLI**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
100
+ - 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
101
+ - 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
102
102
  - Prompting with templates.
103
103
  - Measuring accuracy with a variety of tests.
104
104
  - Benchmarking to get the time-to-first-token and tokens per second.
@@ -153,14 +153,7 @@ Maximum LLM performance requires the right hardware accelerator with the right i
153
153
  </tbody>
154
154
  </table>
155
155
 
156
-
157
-
158
- #### Inference Engines Overview
159
- | Engine | Description |
160
- | :--- | :--- |
161
- | **OnnxRuntime GenAI (OGA)** | Microsoft engine that runs `.onnx` models and enables hardware vendors to provide their own execution providers (EPs) to support specialized hardware, such as neural processing units (NPUs). |
162
- | **llamacpp** | Community-driven engine with strong GPU acceleration, support for thousands of `.gguf` models, and advanced features such as vision-language models (VLMs) and mixture-of-experts (MoEs). |
163
- | **Hugging Face (HF)** | Hugging Face's `transformers` library can run the original `.safetensors` trained weights for models on Meta's PyTorch engine, which provides a source of truth for accuracy measurement. |
156
+ To learn more about the supported hardware and software, visit the documentation [here](./docs/README.md#software-and-hardware-overview).
164
157
 
165
158
  ## Integrate Lemonade Server with Your Application
166
159
 
@@ -1,6 +1,6 @@
1
1
  [![Lemonade tests](https://github.com/lemonade-sdk/lemonade/actions/workflows/test_lemonade.yml/badge.svg)](https://github.com/lemonade-sdk/lemonade/tree/main/test "Check out our tests")
2
2
  [![OS - Windows | Linux](https://img.shields.io/badge/OS-windows%20%7C%20linux-blue)](docs/README.md#installation "Check out our instructions")
3
- [![Made with Python](https://img.shields.io/badge/Python-3.8,3.10-blue?logo=python&logoColor=white)](docs/README.md#installation "Check out our instructions")
3
+ [![Made with Python](https://img.shields.io/badge/Python-3.10-blue?logo=python&logoColor=white)](docs/README.md#installation "Check out our instructions")
4
4
 
5
5
  ## 🍋 Lemonade SDK: Quickly serve, benchmark and deploy LLMs
6
6
 
@@ -15,8 +15,8 @@ The [Lemonade SDK](./docs/README.md) makes it easy to run Large Language Models
15
15
  The [Lemonade SDK](./docs/README.md) is comprised of the following:
16
16
 
17
17
  - 🌐 **[Lemonade Server](https://lemonade-server.ai/docs)**: A local LLM server for running ONNX and GGUF models using the OpenAI API standard. Install and enable your applications with NPU and GPU acceleration in minutes.
18
- - 🐍 **Lemonade API**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
19
- - 🖥️ **Lemonade CLI**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
18
+ - 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
19
+ - 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
20
20
  - Prompting with templates.
21
21
  - Measuring accuracy with a variety of tests.
22
22
  - Benchmarking to get the time-to-first-token and tokens per second.
@@ -71,14 +71,7 @@ Maximum LLM performance requires the right hardware accelerator with the right i
71
71
  </tbody>
72
72
  </table>
73
73
 
74
-
75
-
76
- #### Inference Engines Overview
77
- | Engine | Description |
78
- | :--- | :--- |
79
- | **OnnxRuntime GenAI (OGA)** | Microsoft engine that runs `.onnx` models and enables hardware vendors to provide their own execution providers (EPs) to support specialized hardware, such as neural processing units (NPUs). |
80
- | **llamacpp** | Community-driven engine with strong GPU acceleration, support for thousands of `.gguf` models, and advanced features such as vision-language models (VLMs) and mixture-of-experts (MoEs). |
81
- | **Hugging Face (HF)** | Hugging Face's `transformers` library can run the original `.safetensors` trained weights for models on Meta's PyTorch engine, which provides a source of truth for accuracy measurement. |
74
+ To learn more about the supported hardware and software, visit the documentation [here](./docs/README.md#software-and-hardware-overview).
82
75
 
83
76
  ## Integrate Lemonade Server with Your Application
84
77
 
@@ -24,7 +24,7 @@ class AccuracyHumaneval(Tool):
24
24
  - pass@10: Percentage of problems solved within 10 generation attempts
25
25
  - pass@100: Percentage of problems solved within 100 generation attempts
26
26
 
27
- See docs/lemonade/humaneval_accuracy.md for more details
27
+ See docs/dev_cli/humaneval_accuracy.md for more details
28
28
  """
29
29
 
30
30
  unique_name = "accuracy-humaneval"
@@ -27,7 +27,7 @@ def min_handle_none(*args: int):
27
27
 
28
28
  class AccuracyMMLU(Tool):
29
29
  """
30
- See docs/lemonade/mmlu_accuracy.md for more details
30
+ See docs/dev_cli/mmlu_accuracy.md for more details
31
31
  """
32
32
 
33
33
  unique_name = "accuracy-mmlu"
@@ -58,7 +58,7 @@ class OgaLoad(FirstTool):
58
58
  Input: path to a checkpoint.
59
59
  Supported choices for cpu and igpu from HF model repository:
60
60
  LLM models on Huggingface supported by model_builder. See documentation
61
- (https://github.com/lemonade-sdk/lemonade/blob/main/docs/ort_genai_igpu.md)
61
+ (https://github.com/lemonade-sdk/lemonade/blob/main/docs/dev_cli/ort_genai_igpu.md)
62
62
  for supported models.
63
63
  Supported choices for npu from HF model repository:
64
64
  Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern
@@ -17,7 +17,7 @@ class AccuracyPerplexity(Tool):
17
17
 
18
18
  Output state produced: None
19
19
 
20
- See docs/lemonade/perplexity.md for more details.
20
+ See docs/dev_cli/perplexity.md for more details.
21
21
  """
22
22
 
23
23
  unique_name = "accuracy-perplexity"
@@ -63,7 +63,7 @@ class AccuracyPerplexity(Tool):
63
63
  # try-except will allow a few more LLMs to work
64
64
  max_length = 2048
65
65
  # Set stride to half of the maximum input length for overlapping window processing
66
- # Refer to docs/perplexity.md for more information on sliding window
66
+ # Refer to docs/dev_cli/perplexity.md for more information on sliding window
67
67
  stride = max_length // 2
68
68
  # Determine the total sequence length of the tokenized input
69
69
  seq_len = encodings.input_ids.size(1)
@@ -18,7 +18,7 @@ class QuarkLoad(Tool):
18
18
  Output:
19
19
  - state of the loaded model
20
20
 
21
- See docs/quark.md for more details.
21
+ See docs/dev_cli/quark.md for more details.
22
22
  """
23
23
 
24
24
  unique_name = "quark-load"
@@ -25,7 +25,7 @@ class QuarkQuantize(Tool):
25
25
  Output:
26
26
  - Modifies `state` with quantized and optionally exported model.
27
27
 
28
- See docs/quark.md for more details.
28
+ See docs/dev_cli/quark.md for more details.
29
29
  """
30
30
 
31
31
  unique_name = "quark-quantize"
@@ -94,7 +94,7 @@ class QuarkQuantize(Tool):
94
94
  help="Number of samples for calibration.",
95
95
  )
96
96
 
97
- # See docs/quark.md for more details.
97
+ # See docs/dev_cli/quark.md for more details.
98
98
  parser.add_argument(
99
99
  "--quant-scheme",
100
100
  type=str,
@@ -16,11 +16,29 @@ from fastapi.responses import StreamingResponse
16
16
 
17
17
  from openai import OpenAI
18
18
 
19
- from lemonade_server.pydantic_models import ChatCompletionRequest, PullConfig
19
+ from lemonade_server.pydantic_models import (
20
+ ChatCompletionRequest,
21
+ PullConfig,
22
+ EmbeddingsRequest,
23
+ RerankingRequest,
24
+ )
20
25
  from lemonade_server.model_manager import ModelManager
21
26
  from lemonade.tools.server.utils.port import find_free_port
22
27
 
23
- LLAMA_VERSION = "b5699"
28
+ LLAMA_VERSION = "b5787"
29
+
30
+
31
+ def llamacpp_address(port: int) -> str:
32
+ """
33
+ Generate the base URL for the llamacpp server.
34
+
35
+ Args:
36
+ port: The port number the llamacpp server is running on
37
+
38
+ Returns:
39
+ The base URL for the llamacpp server
40
+ """
41
+ return f"http://127.0.0.1:{port}/v1"
24
42
 
25
43
 
26
44
  def get_llama_server_paths():
@@ -244,10 +262,24 @@ def _wait_for_load(llama_server_process: subprocess.Popen, port: int):
244
262
 
245
263
 
246
264
  def _launch_llama_subprocess(
247
- snapshot_files: dict, use_gpu: bool, telemetry: LlamaTelemetry
265
+ snapshot_files: dict,
266
+ use_gpu: bool,
267
+ telemetry: LlamaTelemetry,
268
+ supports_embeddings: bool = False,
269
+ supports_reranking: bool = False,
248
270
  ) -> subprocess.Popen:
249
271
  """
250
- Launch llama server subprocess with GPU or CPU configuration
272
+ Launch llama server subprocess with appropriate configuration.
273
+
274
+ Args:
275
+ snapshot_files: Dictionary of model files to load
276
+ use_gpu: Whether to use GPU acceleration
277
+ telemetry: Telemetry object for tracking performance metrics
278
+ supports_embeddings: Whether the model supports embeddings
279
+ supports_reranking: Whether the model supports reranking
280
+
281
+ Returns:
282
+ Subprocess handle for the llama server
251
283
  """
252
284
 
253
285
  # Get the current executable path (handles both Windows and Ubuntu structures)
@@ -271,6 +303,14 @@ def _launch_llama_subprocess(
271
303
  # reasoning_content field
272
304
  base_command.extend(["--reasoning-format", "none"])
273
305
 
306
+ # Add embeddings support if the model supports it
307
+ if supports_embeddings:
308
+ base_command.append("--embeddings")
309
+
310
+ # Add reranking support if the model supports it
311
+ if supports_reranking:
312
+ base_command.append("--reranking")
313
+
274
314
  # Configure GPU layers: 99 for GPU, 0 for CPU-only
275
315
  ngl_value = "99" if use_gpu else "0"
276
316
  command = base_command + ["-ngl", ngl_value]
@@ -310,7 +350,6 @@ def _launch_llama_subprocess(
310
350
 
311
351
 
312
352
  def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
313
-
314
353
  # Validate platform support before proceeding
315
354
  validate_platform_support()
316
355
 
@@ -367,15 +406,26 @@ def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
367
406
  logging.info("Cleaned up zip file")
368
407
 
369
408
  # Download the gguf to the hugging face cache
370
- snapshot_files = ModelManager().download_gguf(model_config)
409
+ model_manager = ModelManager()
410
+ snapshot_files = model_manager.download_gguf(model_config)
371
411
  logging.debug(f"GGUF file paths: {snapshot_files}")
372
412
 
413
+ # Check if model supports embeddings
414
+ supported_models = model_manager.supported_models
415
+ model_info = supported_models.get(model_config.model_name, {})
416
+ supports_embeddings = "embeddings" in model_info.get("labels", [])
417
+ supports_reranking = "reranking" in model_info.get("labels", [])
418
+
373
419
  # Start the llama-serve.exe process
374
420
  logging.debug(f"Using llama_server for GGUF model: {llama_server_exe_path}")
375
421
 
376
422
  # Attempt loading on GPU first
377
423
  llama_server_process = _launch_llama_subprocess(
378
- snapshot_files, use_gpu=True, telemetry=telemetry
424
+ snapshot_files,
425
+ use_gpu=True,
426
+ telemetry=telemetry,
427
+ supports_embeddings=supports_embeddings,
428
+ supports_reranking=supports_reranking,
379
429
  )
380
430
 
381
431
  # Check the /health endpoint until GPU server is ready
@@ -395,7 +445,11 @@ def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
395
445
  raise Exception("llamacpp GPU loading failed")
396
446
 
397
447
  llama_server_process = _launch_llama_subprocess(
398
- snapshot_files, use_gpu=False, telemetry=telemetry
448
+ snapshot_files,
449
+ use_gpu=False,
450
+ telemetry=telemetry,
451
+ supports_embeddings=supports_embeddings,
452
+ supports_reranking=supports_reranking,
399
453
  )
400
454
 
401
455
  # Check the /health endpoint until CPU server is ready
@@ -416,7 +470,7 @@ def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
416
470
  def chat_completion(
417
471
  chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
418
472
  ):
419
- base_url = f"http://127.0.0.1:{telemetry.port}/v1"
473
+ base_url = llamacpp_address(telemetry.port)
420
474
  client = OpenAI(
421
475
  base_url=base_url,
422
476
  api_key="lemonade",
@@ -467,3 +521,70 @@ def chat_completion(
467
521
  status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
468
522
  detail=f"Chat completion error: {str(e)}",
469
523
  )
524
+
525
+
526
+ def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
527
+ """
528
+ Generate embeddings using the llamacpp server.
529
+
530
+ Args:
531
+ embeddings_request: The embeddings request containing input text/tokens
532
+ telemetry: Telemetry object containing the server port
533
+
534
+ Returns:
535
+ Embeddings response from the llamacpp server
536
+ """
537
+ base_url = llamacpp_address(telemetry.port)
538
+ client = OpenAI(
539
+ base_url=base_url,
540
+ api_key="lemonade",
541
+ )
542
+
543
+ # Convert Pydantic model to dict and remove unset/null values
544
+ request_dict = embeddings_request.model_dump(exclude_unset=True, exclude_none=True)
545
+
546
+ try:
547
+ # Call the embeddings endpoint
548
+ response = client.embeddings.create(**request_dict)
549
+ return response
550
+
551
+ except Exception as e: # pylint: disable=broad-exception-caught
552
+ raise HTTPException(
553
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
554
+ detail=f"Embeddings error: {str(e)}",
555
+ )
556
+
557
+
558
+ def reranking(reranking_request: RerankingRequest, telemetry: LlamaTelemetry):
559
+ """
560
+ Rerank documents based on their relevance to a query using the llamacpp server.
561
+
562
+ Args:
563
+ reranking_request: The reranking request containing query and documents
564
+ telemetry: Telemetry object containing the server port
565
+
566
+ Returns:
567
+ Reranking response from the llamacpp server containing ranked documents and scores
568
+ """
569
+ base_url = llamacpp_address(telemetry.port)
570
+
571
+ try:
572
+ # Convert Pydantic model to dict and exclude unset/null values
573
+ request_dict = reranking_request.model_dump(
574
+ exclude_unset=True, exclude_none=True
575
+ )
576
+
577
+ # Call the reranking endpoint directly since it's not supported by the OpenAI API
578
+ response = requests.post(
579
+ f"{base_url}/rerank",
580
+ json=request_dict,
581
+ )
582
+ response.raise_for_status()
583
+ return response.json()
584
+
585
+ except Exception as e:
586
+ logging.error("Error during reranking: %s", str(e))
587
+ raise HTTPException(
588
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
589
+ detail=f"Reranking error: {str(e)}",
590
+ ) from e
@@ -54,6 +54,8 @@ from lemonade_server.pydantic_models import (
54
54
  LoadConfig,
55
55
  CompletionRequest,
56
56
  ChatCompletionRequest,
57
+ EmbeddingsRequest,
58
+ RerankingRequest,
57
59
  ResponsesRequest,
58
60
  PullConfig,
59
61
  DeleteConfig,
@@ -231,8 +233,13 @@ class Server(ManagementTool):
231
233
 
232
234
  # OpenAI-compatible routes
233
235
  self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
236
+ self.app.post(f"{prefix}/embeddings")(self.embeddings)
234
237
  self.app.get(f"{prefix}/models")(self.models)
235
238
 
239
+ # JinaAI routes (jina.ai/reranker/)
240
+ self.app.post(f"{prefix}/reranking")(self.reranking)
241
+ self.app.post(f"{prefix}/rerank")(self.reranking)
242
+
236
243
  @staticmethod
237
244
  def parser(add_help: bool = True) -> argparse.ArgumentParser:
238
245
  parser = __class__.helpful_parser(
@@ -796,6 +803,72 @@ class Server(ManagementTool):
796
803
  created=int(time.time()),
797
804
  )
798
805
 
806
+ async def embeddings(self, embeddings_request: EmbeddingsRequest):
807
+ """
808
+ Generate embeddings for the provided input.
809
+ """
810
+ # Initialize load config from embeddings request
811
+ lc = LoadConfig(model_name=embeddings_request.model)
812
+
813
+ # Load the model if it's different from the currently loaded one
814
+ await self.load_llm(lc)
815
+
816
+ if self.llm_loaded.recipe == "llamacpp":
817
+ try:
818
+ return llamacpp.embeddings(embeddings_request, self.llama_telemetry)
819
+ except Exception as e: # pylint: disable=broad-exception-caught
820
+ # Check if model has embeddings label
821
+ model_info = ModelManager().supported_models.get(
822
+ self.llm_loaded.model_name, {}
823
+ )
824
+ if "embeddings" not in model_info.get("labels", []):
825
+ raise HTTPException(
826
+ status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
827
+ detail="You tried to generate embeddings for a model that is "
828
+ "not labeled as an embeddings model. Please use another model "
829
+ "or re-register the current model with the 'embeddings' label.",
830
+ ) from e
831
+ else:
832
+ raise e
833
+ else:
834
+ raise HTTPException(
835
+ status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
836
+ detail=f"Embeddings not supported for recipe: {self.llm_loaded.recipe}",
837
+ )
838
+
839
+ async def reranking(self, reranking_request: RerankingRequest):
840
+ """
841
+ Rerank documents based on their relevance to a query using the llamacpp server.
842
+ """
843
+ # Initialize load config from reranking request
844
+ lc = LoadConfig(model_name=reranking_request.model)
845
+
846
+ # Load the model if it's different from the currently loaded one
847
+ await self.load_llm(lc)
848
+
849
+ if self.llm_loaded.recipe == "llamacpp":
850
+ try:
851
+ return llamacpp.reranking(reranking_request, self.llama_telemetry)
852
+ except Exception as e: # pylint: disable=broad-exception-caught
853
+ # Check if model has reranking label
854
+ model_info = ModelManager().supported_models.get(
855
+ self.llm_loaded.model_name, {}
856
+ )
857
+ if "reranking" not in model_info.get("labels", []):
858
+ raise HTTPException(
859
+ status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
860
+ detail="You tried to use reranking for a model that is "
861
+ "not labeled as a reranking model. Please use another model "
862
+ "or re-register the current model with the 'reranking' label.",
863
+ ) from e
864
+ else:
865
+ raise e
866
+ else:
867
+ raise HTTPException(
868
+ status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
869
+ detail=f"Reranking not supported for recipe: {self.llm_loaded.recipe}",
870
+ )
871
+
799
872
  def apply_chat_template(
800
873
  self, messages: list[dict], tools: list[dict] | None = None
801
874
  ):