lemonade-sdk 8.1.2__py3-none-any.whl → 8.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/tools/oga/utils.py +54 -33
- lemonade/tools/server/llamacpp.py +96 -4
- lemonade/tools/server/serve.py +74 -8
- lemonade/tools/server/static/js/chat.js +735 -0
- lemonade/tools/server/static/js/model-settings.js +162 -0
- lemonade/tools/server/static/js/models.js +865 -0
- lemonade/tools/server/static/js/shared.js +491 -0
- lemonade/tools/server/static/styles.css +652 -26
- lemonade/tools/server/static/webapp.html +145 -1092
- lemonade/tools/server/utils/port.py +3 -2
- lemonade/version.py +1 -1
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/METADATA +7 -6
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/RECORD +21 -17
- lemonade_server/cli.py +31 -17
- lemonade_server/pydantic_models.py +15 -3
- lemonade_server/server_models.json +9 -3
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/top_level.txt +0 -0
lemonade/tools/oga/utils.py
CHANGED
@@ -100,9 +100,10 @@ class OrtGenaiModel(ModelAdapter):
         max_new_tokens=512,
         min_new_tokens=0,
         do_sample=True,
-        top_k=
-        top_p=
-        temperature=
+        top_k=None,
+        top_p=None,
+        temperature=None,
+        repeat_penalty=None,
         streamer: OrtGenaiStreamer = None,
         pad_token_id=None,
         stopping_criteria=None,
@@ -154,38 +155,58 @@ class OrtGenaiModel(ModelAdapter):
         if random_seed is None:
            random_seed = -1  # In og.Generator, -1 = seed with random device

+        # Get search config if available, otherwise use empty dict
+        # Thanks to the empty dict, if the model doesn't have a built-in search
+        # config, the .get() calls will all just use the default values
+        search_config = {}
         if self.config and "search" in self.config:
             search_config = self.config["search"]
-        [old lines 159-188 removed; their contents are collapsed in the diff view]
+
+        # Apply parameter hierarchy: user provided > search config > defaults
+        default_top_k = 50
+        default_top_p = 1.0
+        default_temperature = 0.7
+        default_repetition_penalty = 1.0
+
+        top_k_to_use = (
+            top_k if top_k is not None else search_config.get("top_k", default_top_k)
+        )
+        top_p_to_use = (
+            top_p if top_p is not None else search_config.get("top_p", default_top_p)
+        )
+        temperature_to_use = (
+            temperature
+            if temperature is not None
+            else search_config.get("temperature", default_temperature)
+        )
+        # Map the llamacpp name, `repeat_penalty`, to the OGA name, `repetition_penalty`
+        repetition_penalty_to_use = (
+            repeat_penalty
+            if repeat_penalty is not None
+            else search_config.get("repetition_penalty", default_repetition_penalty)
+        )
+
+        # Set search options once with all parameters
+        params.set_search_options(
+            do_sample=search_config.get("do_sample", do_sample),
+            top_k=top_k_to_use,
+            top_p=top_p_to_use,
+            temperature=temperature_to_use,
+            repetition_penalty=repetition_penalty_to_use,
+            max_length=max_length_to_use,
+            min_length=min_length,
+            early_stopping=search_config.get("early_stopping", False),
+            length_penalty=search_config.get("length_penalty", 1.0),
+            num_beams=search_config.get("num_beams", 1),
+            num_return_sequences=search_config.get("num_return_sequences", 1),
+            past_present_share_buffer=search_config.get(
+                "past_present_share_buffer", True
+            ),
+            random_seed=random_seed,
+            # Not currently supported by OGA
+            # diversity_penalty=search_config.get('diversity_penalty', 0.0),
+            # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
+        )
         params.try_graph_capture_with_max_batch_size(1)

         generator = og.Generator(self.model, params)
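For context, the new generate() signature defaults top_k, top_p, temperature, and repeat_penalty to None, then resolves each one as: caller-provided value, else the model's built-in search config, else a hard-coded default. A minimal sketch of that resolution order follows; the resolve() helper and the example values are illustrative, not part of the SDK.

# Illustrative sketch of the "user provided > search config > default" hierarchy.
def resolve(user_value, search_config, key, default):
    # Explicit caller arguments win; otherwise fall back to the model's
    # built-in search config, and finally to the hard-coded default.
    return user_value if user_value is not None else search_config.get(key, default)

search_config = {"temperature": 0.6}  # pretend this came from the model's config
print(resolve(None, search_config, "temperature", 0.7))  # 0.6 -> from search config
print(resolve(0.2, search_config, "temperature", 0.7))   # 0.2 -> caller override
print(resolve(None, search_config, "top_k", 50))         # 50  -> hard-coded default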
lemonade/tools/server/llamacpp.py
CHANGED

@@ -43,6 +43,72 @@ def llamacpp_address(port: int) -> str:
     return f"http://127.0.0.1:{port}/v1"


+def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
+    """
+    Separate standard OpenAI parameters from custom llama.cpp parameters.
+
+    Args:
+        request_dict: Dictionary of all request parameters
+        endpoint_type: Type of endpoint ("chat" or "completion")
+
+    Returns:
+        Dictionary with parameters properly separated for OpenAI client
+    """
+    openai_client_params = {}
+    extra_params = {}
+
+    # Common OpenAI parameters for both endpoint types
+    common_params = {
+        "model",
+        "frequency_penalty",
+        "logit_bias",
+        "logprobs",
+        "max_tokens",
+        "n",
+        "presence_penalty",
+        "seed",
+        "stop",
+        "stream",
+        "temperature",
+        "top_p",
+        "user",
+    }
+
+    # Standard OpenAI parameters by endpoint type
+    if endpoint_type == "chat":
+        chat_specific_params = {
+            "messages",
+            "top_logprobs",
+            "response_format",
+            "service_tier",
+            "stream_options",
+            "tools",
+            "tool_choice",
+            "parallel_tool_calls",
+        }
+        openai_params = common_params | chat_specific_params
+    else:  # completion
+        completion_specific_params = {
+            "prompt",
+            "best_of",
+            "echo",
+            "suffix",
+        }
+        openai_params = common_params | completion_specific_params
+
+    for key, value in request_dict.items():
+        if key in openai_params:
+            openai_client_params[key] = value
+        else:
+            extra_params[key] = value
+
+    # If there are custom parameters, use extra_body to pass them through
+    if extra_params:
+        openai_client_params["extra_body"] = extra_params
+
+    return openai_client_params
+
+
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
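The helper splits a request dict into arguments the OpenAI client accepts directly and everything else, which is tucked under extra_body so llama.cpp-specific knobs still reach llama-server. A hypothetical input/output pair, assuming the function behaves as written above; the model name and parameter values are placeholders.

# Hypothetical request: "top_k" and "min_p" are not standard OpenAI chat
# parameters, so they should end up under "extra_body".
request_dict = {
    "model": "Llama-3.2-1B-Instruct-GGUF",
    "messages": [{"role": "user", "content": "hi"}],
    "temperature": 0.8,
    "top_k": 40,
    "min_p": 0.05,
}

params = _separate_openai_params(request_dict, "chat")
# params == {
#     "model": "...", "messages": [...], "temperature": 0.8,
#     "extra_body": {"top_k": 40, "min_p": 0.05},
# }
# client.chat.completions.create(**params) then forwards extra_body fields
# verbatim to llama-server's OpenAI-compatible endpoint.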
@@ -226,6 +292,11 @@ def _launch_llama_subprocess(
        "--ctx-size",
        str(ctx_size),
    ]
+
+    # Lock random seed for deterministic behavior in CI
+    if os.environ.get("LEMONADE_CI_MODE"):
+        base_command.extend(["--seed", "42"])
+
    if "mmproj" in snapshot_files:
        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
    if not use_gpu:
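This block pins llama-server's sampling seed whenever the LEMONADE_CI_MODE environment variable is set, making CI runs reproducible. A small sketch of the same gate in isolation; the env-var name comes from the diff, while the surrounding command list is a stand-in.

import os

# Stand-in command list; in the real code this is llama-server's base_command.
cmd = ["llama-server", "--ctx-size", "4096"]

# Any non-empty value enables CI mode; unset or "" leaves the seed random.
if os.environ.get("LEMONADE_CI_MODE"):
    cmd.extend(["--seed", "42"])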
@@ -238,6 +309,15 @@ def _launch_llama_subprocess(
    # Add port and jinja to enable tool use
    base_command.extend(["--port", str(telemetry.port), "--jinja"])

+    # Disable jinja for gpt-oss-120b on Vulkan
+    if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
+        base_command.remove("--jinja")
+        logging.warning(
+            "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+            "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+            "The model cannot use tools. If needed, use the ROCm backend instead."
+        )
+
    # Use legacy reasoning formatting, since not all apps support the new
    # reasoning_content field
    base_command.extend(["--reasoning-format", "none"])
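Because base_command is a plain Python list, the guard simply removes the previously appended "--jinja" flag when the Vulkan backend is paired with a gpt-oss-120b variant. A toy illustration of that list surgery; the variant string shown is hypothetical.

import logging

base_command = ["llama-server", "--port", "8000", "--jinja"]
backend = "vulkan"
variant = "GPT-OSS-120B-GGUF"  # hypothetical snapshot variant name

if backend == "vulkan" and "gpt-oss-120b" in variant.lower():
    base_command.remove("--jinja")  # list.remove drops the first matching element
    logging.warning("Jinja disabled for gpt-oss-120b on Vulkan; tool calling unavailable.")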
@@ -384,13 +464,17 @@ def chat_completion(
        exclude_unset=True, exclude_none=True
    )

+    # Separate standard OpenAI parameters from custom llama.cpp parameters
+    openai_client_params = _separate_openai_params(request_dict, "chat")
+
    # Check if streaming is requested
    if chat_completion_request.stream:

        def event_stream():
            try:
                # Enable streaming
-                [removed line; content collapsed in the diff view]
+                # pylint: disable=missing-kwoa
+                for chunk in client.chat.completions.create(**openai_client_params):
                    yield f"data: {chunk.model_dump_json()}\n\n"
                yield "data: [DONE]\n\n"

@@ -412,7 +496,8 @@ def chat_completion(
    # Non-streaming response
    try:
        # Disable streaming for non-streaming requests
-        [removed line; content collapsed in the diff view]
+        # pylint: disable=missing-kwoa
+        response = client.chat.completions.create(**openai_client_params)

        # Show telemetry after completion
        telemetry.show_telemetry()

@@ -420,6 +505,7 @@ def chat_completion(
        return response

    except Exception as e:  # pylint: disable=broad-exception-caught
+        logging.error("Error during chat completion: %s", str(e))
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Chat completion error: {str(e)}",
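Both streaming paths emit Server-Sent Events lines of the form "data: {json}" terminated by "data: [DONE]", matching OpenAI's wire format. A hedged client-side sketch of consuming that stream with the openai Python package; the base_url, port, and model name are placeholders, not values confirmed by this diff.

from openai import OpenAI

# Placeholder endpoint and model; adjust to your local Lemonade Server setup.
client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="-")

stream = client.chat.completions.create(
    model="Llama-3.2-1B-Instruct-GGUF",
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)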
@@ -446,13 +532,17 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
    # Convert Pydantic model to dict and remove unset/null values
    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)

+    # Separate standard OpenAI parameters from custom llama.cpp parameters
+    openai_client_params = _separate_openai_params(request_dict, "completion")
+
    # Check if streaming is requested
    if completion_request.stream:

        def event_stream():
            try:
                # Enable streaming
-                [removed line; content collapsed in the diff view]
+                # pylint: disable=missing-kwoa
+                for chunk in client.completions.create(**openai_client_params):
                    yield f"data: {chunk.model_dump_json()}\n\n"
                yield "data: [DONE]\n\n"

@@ -474,7 +564,8 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
    # Non-streaming response
    try:
        # Disable streaming for non-streaming requests
-        [removed line; content collapsed in the diff view]
+        # pylint: disable=missing-kwoa
+        response = client.completions.create(**openai_client_params)

        # Show telemetry after completion
        telemetry.show_telemetry()

@@ -482,6 +573,7 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
        return response

    except Exception as e:  # pylint: disable=broad-exception-caught
+        logging.error("Error during completion: %s", str(e))
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Completion error: {str(e)}",
lemonade/tools/server/serve.py
CHANGED
@@ -54,7 +54,11 @@ from lemonade.tools.server.utils.port import lifespan

 from lemonade_server.model_manager import ModelManager
 from lemonade_server.pydantic_models import (
-    [removed line; content collapsed in the diff view]
+    DEFAULT_PORT,
+    DEFAULT_HOST,
+    DEFAULT_LOG_LEVEL,
+    DEFAULT_LLAMACPP_BACKEND,
+    DEFAULT_CTX_SIZE,
     LoadConfig,
     CompletionRequest,
     ChatCompletionRequest,

@@ -65,19 +69,16 @@ from lemonade_server.pydantic_models import (
     DeleteConfig,
 )

+# Set to a high number to allow for interesting experiences in real apps
+# Tests should use the max_new_tokens argument to set a lower value
+DEFAULT_MAX_NEW_TOKENS = 1500
+
 # Only import tray on Windows
 if platform.system() == "Windows":
     # pylint: disable=ungrouped-imports
     from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


-DEFAULT_PORT = 8000
-DEFAULT_HOST = "localhost"
-DEFAULT_LOG_LEVEL = "info"
-DEFAULT_LLAMACPP_BACKEND = "vulkan"
-DEFAULT_CTX_SIZE = 4096
-
-
 class ServerModel(Model):
     """
     An extension of OpenAI's Model class that adds
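With these constants moved out of serve.py, the server and the CLI can share one source of truth in lemonade_server.pydantic_models. A minimal sketch of the intended usage; the helper function is illustrative only, and the values shown earlier (8000, "localhost", 4096) are the old serve.py defaults, which may or may not match the new module.

# Shared defaults now live in lemonade_server.pydantic_models (per the import hunk above).
from lemonade_server.pydantic_models import DEFAULT_PORT, DEFAULT_HOST, DEFAULT_CTX_SIZE

def describe_defaults() -> str:
    # Helper name is illustrative, not part of the SDK.
    return f"serving on {DEFAULT_HOST}:{DEFAULT_PORT} with ctx size {DEFAULT_CTX_SIZE}"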
@@ -258,6 +259,47 @@ class Server:
        self.app.post(f"{prefix}/reranking")(self.reranking)
        self.app.post(f"{prefix}/rerank")(self.reranking)

+    def _log_request_parameters(self, request, endpoint_name: str):
+        """
+        Log request parameters excluding content fields like messages, prompt, or input.
+
+        Args:
+            request: Any request object (CompletionRequest, ChatCompletionRequest, etc.)
+            endpoint_name: Name of the endpoint for logging context
+        """
+        if not logging.getLogger().isEnabledFor(logging.DEBUG):
+            return
+
+        # Fields to exclude from logging (content fields)
+        excluded_fields = {"messages", "prompt", "input"}
+
+        # Get all attributes from the request object
+        request_params = {}
+        if hasattr(request, "__dict__"):
+            # For pydantic models, get the dict representation
+            if hasattr(request, "model_dump"):
+                all_params = request.model_dump()
+            elif hasattr(request, "dict"):
+                all_params = request.dict()
+            else:
+                all_params = request.__dict__
+
+            # Filter out excluded fields and add special handling for certain fields
+            for key, value in all_params.items():
+                if key not in excluded_fields:
+                    # Special handling for tools field - show count instead of full content
+                    if key == "tools" and value is not None:
+                        request_params[key] = (
+                            f"{len(value)} tools" if isinstance(value, list) else value
+                        )
+                    # Special handling for input type in responses
+                    elif key == "input" and hasattr(request, "input"):
+                        request_params["input_type"] = type(value).__name__
+                    else:
+                        request_params[key] = value
+
+        logging.debug(f"{endpoint_name} request parameters: {request_params}")
+
    def _setup_server_common(
        self,
        tray: bool = False,
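_log_request_parameters only does work when the root logger is at DEBUG, and it drops the bulky content fields (messages, prompt, input) while summarizing tools as a count. A hedged sketch of what that filtering produces for a typical chat request; the stand-in request object and the logged text are illustrative.

import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.DEBUG)

# Stand-in request object; the real code receives a pydantic ChatCompletionRequest.
request = SimpleNamespace(
    messages=[{"role": "user", "content": "long prompt ..."}],
    temperature=0.7,
    tools=[{"type": "function"}, {"type": "function"}],
)

# Mirrors the filtering above: content fields dropped, tools shown as a count.
params = {
    k: (f"{len(v)} tools" if k == "tools" and isinstance(v, list) else v)
    for k, v in vars(request).items()
    if k not in {"messages", "prompt", "input"}
}
logging.debug(f"Chat completions request parameters: {params}")
# -> DEBUG:root:Chat completions request parameters: {'temperature': 0.7, 'tools': '2 tools'}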
@@ -435,6 +477,9 @@ class Server:

        lc = self.initialize_load_config(completion_request)

+        # Log request parameters (excluding message content for brevity)
+        self._log_request_parameters(completion_request, "Completions")
+
        # Load the model if it's different from the currently loaded one
        await self.load_llm(lc)

@@ -456,6 +501,9 @@ class Server:
            "message": text,
            "stop": completion_request.stop,
            "temperature": completion_request.temperature,
+            "repeat_penalty": completion_request.repeat_penalty,
+            "top_k": completion_request.top_k,
+            "top_p": completion_request.top_p,
            "max_new_tokens": completion_request.max_tokens,
        }

@@ -564,6 +612,9 @@ class Server:

        lc = self.initialize_load_config(chat_completion_request)

+        # Log request parameters (excluding message history for brevity)
+        self._log_request_parameters(chat_completion_request, "Chat completions")
+
        # Load the model if it's different from the currently loaded one
        await self.load_llm(lc)

@@ -608,6 +659,9 @@ class Server:
            "message": text,
            "stop": chat_completion_request.stop,
            "temperature": chat_completion_request.temperature,
+            "repeat_penalty": chat_completion_request.repeat_penalty,
+            "top_k": chat_completion_request.top_k,
+            "top_p": chat_completion_request.top_p,
            "max_new_tokens": max_new_tokens,
        }

@@ -856,6 +910,9 @@ class Server:

        lc = self.initialize_load_config(responses_request)

+        # Log request parameters (excluding message history for brevity)
+        self._log_request_parameters(responses_request, "Responses")
+
        # Load the model if it's different from the currently loaded one
        await self.load_llm(lc)

@@ -877,6 +934,9 @@ class Server:
        generation_args = {
            "message": text,
            "temperature": responses_request.temperature,
+            "repeat_penalty": responses_request.repeat_penalty,
+            "top_k": responses_request.top_k,
+            "top_p": responses_request.top_p,
            "max_new_tokens": responses_request.max_output_tokens,
        }

@@ -1006,6 +1066,9 @@ class Server:
        stop: list[str] | str | None = None,
        max_new_tokens: int | None = None,
        temperature: float | None = None,
+        repeat_penalty: float | None = None,
+        top_k: int | None = None,
+        top_p: float | None = None,
    ):
        """
        Core streaming completion logic, separated from response handling.

@@ -1088,6 +1151,9 @@ class Server:
            "pad_token_id": tokenizer.eos_token_id,
            "stopping_criteria": stopping_criteria,
            "temperature": temperature,
+            "repeat_penalty": repeat_penalty,
+            "top_k": top_k,
+            "top_p": top_p,
        }

        # Initialize performance variables