lemonade-sdk 8.0.3__py3-none-any.whl → 8.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lemonade/tools/humaneval.py +1 -1
- lemonade/tools/mmlu.py +1 -1
- lemonade/tools/oga/load.py +1 -1
- lemonade/tools/perplexity.py +2 -2
- lemonade/tools/quark/quark_load.py +1 -1
- lemonade/tools/quark/quark_quantize.py +2 -2
- lemonade/tools/server/llamacpp.py +130 -9
- lemonade/tools/server/serve.py +73 -0
- lemonade/tools/server/static/styles.css +424 -4
- lemonade/tools/server/static/webapp.html +301 -35
- lemonade/version.py +1 -1
- {lemonade_sdk-8.0.3.dist-info → lemonade_sdk-8.0.4.dist-info}/METADATA +5 -12
- {lemonade_sdk-8.0.3.dist-info → lemonade_sdk-8.0.4.dist-info}/RECORD +21 -21
- lemonade_server/model_manager.py +12 -2
- lemonade_server/pydantic_models.py +25 -1
- lemonade_server/server_models.json +46 -44
- {lemonade_sdk-8.0.3.dist-info → lemonade_sdk-8.0.4.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.0.3.dist-info → lemonade_sdk-8.0.4.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.0.3.dist-info → lemonade_sdk-8.0.4.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.0.3.dist-info → lemonade_sdk-8.0.4.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.0.3.dist-info → lemonade_sdk-8.0.4.dist-info}/top_level.txt +0 -0
lemonade/tools/humaneval.py
CHANGED
@@ -24,7 +24,7 @@ class AccuracyHumaneval(Tool):
     - pass@10: Percentage of problems solved within 10 generation attempts
     - pass@100: Percentage of problems solved within 100 generation attempts
 
-    See docs/
+    See docs/dev_cli/humaneval_accuracy.md for more details
     """
 
     unique_name = "accuracy-humaneval"
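The pass@10 and pass@100 figures in this docstring are the standard HumanEval measures. For reference, a minimal sketch of the usual unbiased pass@k estimator from the HumanEval paper; the diff does not show lemonade's own implementation, so treat this as illustrative only:

from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: chance that at least one of k samples drawn
    from n generations (c of them correct) solves the problem."""
    if n - c < k:
        return 1.0  # every size-k draw necessarily contains a correct sample
    return 1.0 - comb(n - c, k) / comb(n, k)

# Example: 200 generations per problem, 15 of them correct
print(f"pass@10  = {pass_at_k(200, 15, 10):.4f}")
print(f"pass@100 = {pass_at_k(200, 15, 100):.4f}")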
lemonade/tools/mmlu.py
CHANGED
lemonade/tools/oga/load.py
CHANGED
@@ -58,7 +58,7 @@ class OgaLoad(FirstTool):
     Input: path to a checkpoint.
     Supported choices for cpu and igpu from HF model repository:
         LLM models on Huggingface supported by model_builder. See documentation
-        (https://github.com/lemonade-sdk/lemonade/blob/main/docs/ort_genai_igpu.md)
+        (https://github.com/lemonade-sdk/lemonade/blob/main/docs/dev_cli/ort_genai_igpu.md)
         for supported models.
     Supported choices for npu from HF model repository:
         Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern
lemonade/tools/perplexity.py
CHANGED
@@ -17,7 +17,7 @@ class AccuracyPerplexity(Tool):
 
     Output state produced: None
 
-    See docs/
+    See docs/dev_cli/perplexity.md for more details.
     """
 
     unique_name = "accuracy-perplexity"
@@ -63,7 +63,7 @@ class AccuracyPerplexity(Tool):
         # try-except will allow a few more LLMs to work
         max_length = 2048
        # Set stride to half of the maximum input length for overlapping window processing
-        # Refer to docs/perplexity.md for more information on sliding window
+        # Refer to docs/dev_cli/perplexity.md for more information on sliding window
         stride = max_length // 2
         # Determine the total sequence length of the tokenized input
         seq_len = encodings.input_ids.size(1)
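The stride comment above describes the standard sliding-window perplexity evaluation: windows of max_length tokens advance by max_length // 2, and only the new half of each window is scored. A minimal sketch of that pattern, assuming a Hugging Face-style causal LM whose loss averages over unmasked labels; names are illustrative, not lemonade's exact code:

import torch

def sliding_window_perplexity(model, input_ids: torch.Tensor, max_length: int = 2048):
    """Score a long sequence in overlapping windows so each evaluated
    token keeps up to max_length tokens of left context."""
    stride = max_length // 2  # half-window overlap, as in the tool above
    seq_len = input_ids.size(1)
    nlls, prev_end = [], 0
    for begin in range(0, seq_len, stride):
        end = min(begin + max_length, seq_len)
        target_len = end - prev_end  # score only tokens not already scored
        window = input_ids[:, begin:end]
        labels = window.clone()
        labels[:, :-target_len] = -100  # mask positions that are context only
        with torch.no_grad():
            loss = model(window, labels=labels).loss  # mean NLL over target tokens
        nlls.append(loss * target_len)
        prev_end = end
        if end == seq_len:
            break
    return torch.exp(torch.stack(nlls).sum() / seq_len)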
lemonade/tools/quark/quark_quantize.py
CHANGED
@@ -25,7 +25,7 @@ class QuarkQuantize(Tool):
     Output:
         - Modifies `state` with quantized and optionally exported model.
 
-    See docs/quark.md for more details.
+    See docs/dev_cli/quark.md for more details.
     """
 
     unique_name = "quark-quantize"
@@ -94,7 +94,7 @@ class QuarkQuantize(Tool):
             help="Number of samples for calibration.",
         )
 
-        # See docs/quark.md for more details.
+        # See docs/dev_cli/quark.md for more details.
         parser.add_argument(
             "--quant-scheme",
             type=str,
lemonade/tools/server/llamacpp.py
CHANGED
@@ -16,11 +16,29 @@ from fastapi.responses import StreamingResponse
 
 from openai import OpenAI
 
-from lemonade_server.pydantic_models import
+from lemonade_server.pydantic_models import (
+    ChatCompletionRequest,
+    PullConfig,
+    EmbeddingsRequest,
+    RerankingRequest,
+)
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.server.utils.port import find_free_port
 
-LLAMA_VERSION = "
+LLAMA_VERSION = "b5787"
+
+
+def llamacpp_address(port: int) -> str:
+    """
+    Generate the base URL for the llamacpp server.
+
+    Args:
+        port: The port number the llamacpp server is running on
+
+    Returns:
+        The base URL for the llamacpp server
+    """
+    return f"http://127.0.0.1:{port}/v1"
 
 
 def get_llama_server_paths():
@@ -244,10 +262,24 @@ def _wait_for_load(llama_server_process: subprocess.Popen, port: int):
 
 
 def _launch_llama_subprocess(
-    snapshot_files: dict,
+    snapshot_files: dict,
+    use_gpu: bool,
+    telemetry: LlamaTelemetry,
+    supports_embeddings: bool = False,
+    supports_reranking: bool = False,
 ) -> subprocess.Popen:
     """
-    Launch llama server subprocess with
+    Launch llama server subprocess with appropriate configuration.
+
+    Args:
+        snapshot_files: Dictionary of model files to load
+        use_gpu: Whether to use GPU acceleration
+        telemetry: Telemetry object for tracking performance metrics
+        supports_embeddings: Whether the model supports embeddings
+        supports_reranking: Whether the model supports reranking
+
+    Returns:
+        Subprocess handle for the llama server
     """
 
     # Get the current executable path (handles both Windows and Ubuntu structures)
@@ -271,6 +303,14 @@ def _launch_llama_subprocess(
     # reasoning_content field
     base_command.extend(["--reasoning-format", "none"])
 
+    # Add embeddings support if the model supports it
+    if supports_embeddings:
+        base_command.append("--embeddings")
+
+    # Add reranking support if the model supports it
+    if supports_reranking:
+        base_command.append("--reranking")
+
     # Configure GPU layers: 99 for GPU, 0 for CPU-only
     ngl_value = "99" if use_gpu else "0"
     command = base_command + ["-ngl", ngl_value]
@@ -310,7 +350,6 @@ def _launch_llama_subprocess(
 
 
 def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
-
     # Validate platform support before proceeding
     validate_platform_support()
 
@@ -367,15 +406,26 @@ def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
     logging.info("Cleaned up zip file")
 
     # Download the gguf to the hugging face cache
-
+    model_manager = ModelManager()
+    snapshot_files = model_manager.download_gguf(model_config)
     logging.debug(f"GGUF file paths: {snapshot_files}")
 
+    # Check if model supports embeddings
+    supported_models = model_manager.supported_models
+    model_info = supported_models.get(model_config.model_name, {})
+    supports_embeddings = "embeddings" in model_info.get("labels", [])
+    supports_reranking = "reranking" in model_info.get("labels", [])
+
    # Start the llama-serve.exe process
    logging.debug(f"Using llama_server for GGUF model: {llama_server_exe_path}")
 
    # Attempt loading on GPU first
    llama_server_process = _launch_llama_subprocess(
-        snapshot_files,
+        snapshot_files,
+        use_gpu=True,
+        telemetry=telemetry,
+        supports_embeddings=supports_embeddings,
+        supports_reranking=supports_reranking,
    )
 
    # Check the /health endpoint until GPU server is ready
@@ -395,7 +445,11 @@ def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
        raise Exception("llamacpp GPU loading failed")
 
    llama_server_process = _launch_llama_subprocess(
-        snapshot_files,
+        snapshot_files,
+        use_gpu=False,
+        telemetry=telemetry,
+        supports_embeddings=supports_embeddings,
+        supports_reranking=supports_reranking,
    )
 
    # Check the /health endpoint until CPU server is ready
@@ -416,7 +470,7 @@ def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
 def chat_completion(
     chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
 ):
-    base_url =
+    base_url = llamacpp_address(telemetry.port)
     client = OpenAI(
         base_url=base_url,
         api_key="lemonade",
@@ -467,3 +521,70 @@ def chat_completion(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Chat completion error: {str(e)}",
         )
+
+
+def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
+    """
+    Generate embeddings using the llamacpp server.
+
+    Args:
+        embeddings_request: The embeddings request containing input text/tokens
+        telemetry: Telemetry object containing the server port
+
+    Returns:
+        Embeddings response from the llamacpp server
+    """
+    base_url = llamacpp_address(telemetry.port)
+    client = OpenAI(
+        base_url=base_url,
+        api_key="lemonade",
+    )
+
+    # Convert Pydantic model to dict and remove unset/null values
+    request_dict = embeddings_request.model_dump(exclude_unset=True, exclude_none=True)
+
+    try:
+        # Call the embeddings endpoint
+        response = client.embeddings.create(**request_dict)
+        return response
+
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Embeddings error: {str(e)}",
+        )
+
+
+def reranking(reranking_request: RerankingRequest, telemetry: LlamaTelemetry):
+    """
+    Rerank documents based on their relevance to a query using the llamacpp server.
+
+    Args:
+        reranking_request: The reranking request containing query and documents
+        telemetry: Telemetry object containing the server port
+
+    Returns:
+        Reranking response from the llamacpp server containing ranked documents and scores
+    """
+    base_url = llamacpp_address(telemetry.port)
+
+    try:
+        # Convert Pydantic model to dict and exclude unset/null values
+        request_dict = reranking_request.model_dump(
+            exclude_unset=True, exclude_none=True
+        )
+
+        # Call the reranking endpoint directly since it's not supported by the OpenAI API
+        response = requests.post(
+            f"{base_url}/rerank",
+            json=request_dict,
+        )
+        response.raise_for_status()
+        return response.json()
+
+    except Exception as e:
+        logging.error("Error during reranking: %s", str(e))
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Reranking error: {str(e)}",
+        ) from e
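The hunks above launch llama-server with the new --embeddings/--reranking flags and then, per the comments, poll the /health endpoint until the GPU server is ready, falling back to a CPU launch if that fails. A minimal sketch of that polling pattern, assuming llama.cpp's standard /health route; the helper name and timeout are illustrative, not lemonade's _wait_for_load:

import time
import requests

def wait_for_health(port: int, timeout_s: float = 120.0) -> bool:
    """Poll llama-server's /health endpoint until it returns 200 (ready)
    or the timeout expires."""
    url = f"http://127.0.0.1:{port}/health"
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass  # process is up but not accepting connections yet
        time.sleep(0.5)
    return False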
lemonade/tools/server/serve.py
CHANGED
@@ -54,6 +54,8 @@ from lemonade_server.pydantic_models import (
     LoadConfig,
     CompletionRequest,
     ChatCompletionRequest,
+    EmbeddingsRequest,
+    RerankingRequest,
     ResponsesRequest,
     PullConfig,
     DeleteConfig,
@@ -231,8 +233,13 @@ class Server(ManagementTool):
 
         # OpenAI-compatible routes
         self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
+        self.app.post(f"{prefix}/embeddings")(self.embeddings)
         self.app.get(f"{prefix}/models")(self.models)
 
+        # JinaAI routes (jina.ai/reranker/)
+        self.app.post(f"{prefix}/reranking")(self.reranking)
+        self.app.post(f"{prefix}/rerank")(self.reranking)
+
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
@@ -796,6 +803,72 @@ class Server(ManagementTool):
             created=int(time.time()),
         )
 
+    async def embeddings(self, embeddings_request: EmbeddingsRequest):
+        """
+        Generate embeddings for the provided input.
+        """
+        # Initialize load config from embeddings request
+        lc = LoadConfig(model_name=embeddings_request.model)
+
+        # Load the model if it's different from the currently loaded one
+        await self.load_llm(lc)
+
+        if self.llm_loaded.recipe == "llamacpp":
+            try:
+                return llamacpp.embeddings(embeddings_request, self.llama_telemetry)
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                # Check if model has embeddings label
+                model_info = ModelManager().supported_models.get(
+                    self.llm_loaded.model_name, {}
+                )
+                if "embeddings" not in model_info.get("labels", []):
+                    raise HTTPException(
+                        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                        detail="You tried to generate embeddings for a model that is "
+                        "not labeled as an embeddings model. Please use another model "
+                        "or re-register the current model with the 'embeddings' label.",
+                    ) from e
+                else:
+                    raise e
+        else:
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"Embeddings not supported for recipe: {self.llm_loaded.recipe}",
+            )
+
+    async def reranking(self, reranking_request: RerankingRequest):
+        """
+        Rerank documents based on their relevance to a query using the llamacpp server.
+        """
+        # Initialize load config from reranking request
+        lc = LoadConfig(model_name=reranking_request.model)
+
+        # Load the model if it's different from the currently loaded one
+        await self.load_llm(lc)
+
+        if self.llm_loaded.recipe == "llamacpp":
+            try:
+                return llamacpp.reranking(reranking_request, self.llama_telemetry)
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                # Check if model has reranking label
+                model_info = ModelManager().supported_models.get(
+                    self.llm_loaded.model_name, {}
+                )
+                if "reranking" not in model_info.get("labels", []):
+                    raise HTTPException(
+                        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                        detail="You tried to use reranking for a model that is "
+                        "not labeled as a reranking model. Please use another model "
+                        "or re-register the current model with the 'reranking' label.",
+                    ) from e
+                else:
+                    raise e
+        else:
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"Reranking not supported for recipe: {self.llm_loaded.recipe}",
+            )
+
     def apply_chat_template(
         self, messages: list[dict], tools: list[dict] | None = None
     ):
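Together with the llamacpp.py changes, these routes expose embeddings and reranking over HTTP. A minimal client sketch, assuming lemonade-server's default http://localhost:8000/api/v1 prefix; the model names are hypothetical and would need to be registered with the matching 'embeddings'/'reranking' labels:

import requests
from openai import OpenAI

BASE = "http://localhost:8000/api/v1"  # assumed default address and route prefix

# Embeddings via the OpenAI-compatible route registered above
client = OpenAI(base_url=BASE, api_key="lemonade")
emb = client.embeddings.create(
    model="my-embeddings-model",  # hypothetical; needs the 'embeddings' label
    input=["lemonade serves LLMs on local hardware"],
)
print(len(emb.data[0].embedding))

# Reranking via the JinaAI-style route registered above
resp = requests.post(
    f"{BASE}/rerank",
    json={
        "model": "my-reranking-model",  # hypothetical; needs the 'reranking' label
        "query": "What is lemonade?",
        "documents": ["lemonade is a local LLM server", "lemons are citrus fruit"],
    },
)
resp.raise_for_status()
print(resp.json())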
|