llama-cpp-pydist 0.19.0-py3-none-any.whl → 0.21.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
  2. llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
  3. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
  4. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
  5. vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  6. vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
  7. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
  9. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
  10. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
  11. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
  12. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
  13. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
  14. vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
  15. vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
  16. vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
  17. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
  18. vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
  19. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
  20. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
  21. vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
  22. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
  23. vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
  24. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
  25. vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
  26. vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
  27. vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
  28. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
  29. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
  30. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
  31. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
  32. vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
  33. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
  34. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
  35. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
  36. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
  37. vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
  38. vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
  39. vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
  40. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
  41. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
  42. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
  43. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  44. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
  45. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
  46. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
  47. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
  48. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
  49. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  50. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  51. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  52. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  53. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  54. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  55. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
  56. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
  57. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
  58. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
  59. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
  100. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
  101. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  102. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
  103. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  104. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  105. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
  106. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
  107. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
  108. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
  109. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
  110. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
  111. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
  112. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
  113. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
  114. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  115. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
  116. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
  117. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
  118. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
  119. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
  120. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
  121. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
  122. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
  123. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
  124. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
  125. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
  126. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
  127. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  128. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
  129. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
  130. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
  131. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
  132. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
  133. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
  134. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
  135. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
  136. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  137. vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
  138. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
  139. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
  140. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
  141. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
  142. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
  143. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
  144. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
  145. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
  146. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
  147. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
  148. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
  149. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
  150. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
  151. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
  152. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
  153. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
  154. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
  155. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
  156. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
  157. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
  158. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
  159. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
  160. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
  161. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
  162. vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
  163. vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
  164. vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
  165. vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  166. vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
  167. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  168. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  169. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
  170. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  171. vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
  172. vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
  173. vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
  174. vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  175. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
  176. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
  177. vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  178. vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
  179. vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
  180. vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
  181. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
  182. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
  183. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
  184. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
  185. vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
  186. vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
  187. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
  188. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
  189. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
  190. vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
  191. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
  192. vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
  193. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
  194. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
  195. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
  196. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
  197. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
  198. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
  199. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
  200. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
  201. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
  202. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
  203. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
  204. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
  205. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
  206. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
  207. vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
  208. vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
  209. vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
  210. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
  211. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  212. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
  213. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
  214. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
  215. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
  216. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
  217. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
  218. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
  219. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
  220. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
  221. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
  222. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
  223. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
  224. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
  225. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
  226. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
  227. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
  228. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
  229. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
  230. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
  231. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
  232. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
  233. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
  234. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
  235. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
  236. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
  237. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
  238. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
  239. llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
  240. vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
  241. {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
  242. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
tools/server/tests/unit/test_chat_completion.py
@@ -434,8 +434,8 @@ def test_context_size_exceeded_stream():
 @pytest.mark.parametrize(
     "n_batch,batch_count,reuse_cache",
     [
-        (64, 3, False),
-        (64, 1, True),
+        (64, 4, False),
+        (64, 2, True),
     ]
 )
 def test_return_progress(n_batch, batch_count, reuse_cache):
@@ -462,10 +462,18 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
     res = make_cmpl_request()
     last_progress = None
     total_batch_count = 0
+
     for data in res:
         cur_progress = data.get("prompt_progress", None)
         if cur_progress is None:
             continue
+        if total_batch_count == 0:
+            # first progress report must have n_cache == n_processed
+            assert cur_progress["total"] > 0
+            assert cur_progress["cache"] == cur_progress["processed"]
+            if reuse_cache:
+                # when reusing cache, we expect some cached tokens
+                assert cur_progress["cache"] > 0
         if last_progress is not None:
             assert cur_progress["total"] == last_progress["total"]
             assert cur_progress["cache"] == last_progress["cache"]
@@ -473,6 +481,7 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
         total_batch_count += 1
         last_progress = cur_progress
 
+    # last progress should indicate completion (all tokens processed)
     assert last_progress is not None
     assert last_progress["total"] > 0
     assert last_progress["processed"] == last_progress["total"]
tools/server/tests/unit/test_sleep.py
@@ -0,0 +1,39 @@
+import pytest
+import time
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_server_sleep():
+    global server
+    server.sleep_idle_seconds = 1
+    server.start()
+
+    # wait a bit so that server can go to sleep
+    time.sleep(2)
+
+    # make sure these endpoints are still responsive after sleep
+    res = server.make_request("GET", "/health")
+    assert res.status_code == 200
+    res = server.make_request("GET", "/props")
+    assert res.status_code == 200
+    assert res.body["is_sleeping"] == True
+
+    # make a generation request to wake up the server
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 1,
+        "prompt": "Hello",
+    })
+    assert res.status_code == 200
+
+    # it should no longer be sleeping
+    res = server.make_request("GET", "/props")
+    assert res.status_code == 200
+    assert res.body["is_sleeping"] == False
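
The test drives this through the harness; against a manually started server the same sleep/wake behavior can be observed directly. A rough sketch, assuming `llama-server` was launched with the new `--sleep-idle-seconds` flag (the flag name comes from the harness change below; host and port are assumptions):

    import time
    import requests  # assumed HTTP client

    BASE = "http://localhost:8080"  # hypothetical; match your --port

    time.sleep(6)  # exceed an idle window of e.g. --sleep-idle-seconds 5
    assert requests.get(f"{BASE}/props").json()["is_sleeping"] is True

    # any generation request wakes the server back up
    requests.post(f"{BASE}/completion", json={"prompt": "Hi", "n_predict": 1})
    assert requests.get(f"{BASE}/props").json()["is_sleeping"] is False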
tools/server/tests/utils.py
@@ -100,6 +100,7 @@ class ServerProcess:
     server_path: str | None = None
     mmproj_url: str | None = None
     media_path: str | None = None
+    sleep_idle_seconds: int | None = None
 
     # session variables
     process: subprocess.Popen | None = None
@@ -230,6 +231,8 @@ class ServerProcess:
             server_args.extend(["--mmproj-url", self.mmproj_url])
         if self.media_path:
             server_args.extend(["--media-path", self.media_path])
+        if self.sleep_idle_seconds is not None:
+            server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
 
         args = [str(arg) for arg in [server_path, *server_args]]
         print(f"tests: starting server with: {' '.join(args)}")
tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
@@ -89,6 +89,7 @@
   const fallbackToolCalls = $derived(typeof toolCallContent === 'string' ? toolCallContent : null);
 
   const processingState = useProcessingState();
+
   let currentConfig = $derived(config());
   let isRouter = $derived(isRouterMode());
   let displayedModel = $derived((): string | null => {
@@ -116,6 +117,12 @@
     }
   });
 
+  $effect(() => {
+    if (isLoading() && !message?.content?.trim()) {
+      processingState.startMonitoring();
+    }
+  });
+
   function formatToolCallBadge(toolCall: ApiChatCompletionToolCall, index: number) {
     const callNumber = index + 1;
     const functionName = toolCall.function?.name?.trim();
@@ -186,7 +193,7 @@
   <div class="mt-6 w-full max-w-[48rem]" in:fade>
     <div class="processing-container">
       <span class="processing-text">
-        {processingState.getProcessingMessage()}
+        {processingState.getPromptProgressText() ?? processingState.getProcessingMessage()}
       </span>
     </div>
   </div>
@@ -263,6 +270,23 @@
         predictedTokens={message.timings.predicted_n}
         predictedMs={message.timings.predicted_ms}
       />
+    {:else if isLoading() && currentConfig.showMessageStats}
+      {@const liveStats = processingState.getLiveProcessingStats()}
+      {@const genStats = processingState.getLiveGenerationStats()}
+      {@const promptProgress = processingState.processingState?.promptProgress}
+      {@const isStillProcessingPrompt =
+        promptProgress && promptProgress.processed < promptProgress.total}
+
+      {#if liveStats || genStats}
+        <ChatMessageStatistics
+          isLive={true}
+          isProcessingPrompt={!!isStillProcessingPrompt}
+          promptTokens={liveStats?.tokensProcessed}
+          promptMs={liveStats?.timeMs}
+          predictedTokens={genStats?.tokensGenerated}
+          predictedMs={genStats?.timeMs}
+        />
+      {/if}
     {/if}
   </div>
 {/if}
tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte
@@ -5,21 +5,64 @@
   import { ChatMessageStatsView } from '$lib/enums';
 
   interface Props {
-    predictedTokens: number;
-    predictedMs: number;
+    predictedTokens?: number;
+    predictedMs?: number;
     promptTokens?: number;
     promptMs?: number;
+    // Live mode: when true, shows stats during streaming
+    isLive?: boolean;
+    // Whether prompt processing is still in progress
+    isProcessingPrompt?: boolean;
+    // Initial view to show (defaults to READING in live mode)
+    initialView?: ChatMessageStatsView;
   }
 
-  let { predictedTokens, predictedMs, promptTokens, promptMs }: Props = $props();
+  let {
+    predictedTokens,
+    predictedMs,
+    promptTokens,
+    promptMs,
+    isLive = false,
+    isProcessingPrompt = false,
+    initialView = ChatMessageStatsView.GENERATION
+  }: Props = $props();
 
-  let activeView: ChatMessageStatsView = $state(ChatMessageStatsView.GENERATION);
+  let activeView: ChatMessageStatsView = $state(initialView);
+  let hasAutoSwitchedToGeneration = $state(false);
 
-  let tokensPerSecond = $derived((predictedTokens / predictedMs) * 1000);
-  let timeInSeconds = $derived((predictedMs / 1000).toFixed(2));
+  // In live mode: auto-switch to GENERATION tab when prompt processing completes
+  $effect(() => {
+    if (isLive) {
+      // Auto-switch to generation tab only when prompt processing is done (once)
+      if (
+        !hasAutoSwitchedToGeneration &&
+        !isProcessingPrompt &&
+        predictedTokens &&
+        predictedTokens > 0
+      ) {
+        activeView = ChatMessageStatsView.GENERATION;
+        hasAutoSwitchedToGeneration = true;
+      } else if (!hasAutoSwitchedToGeneration) {
+        // Stay on READING while prompt is still being processed
+        activeView = ChatMessageStatsView.READING;
+      }
+    }
+  });
+
+  let hasGenerationStats = $derived(
+    predictedTokens !== undefined &&
+      predictedTokens > 0 &&
+      predictedMs !== undefined &&
+      predictedMs > 0
+  );
+
+  let tokensPerSecond = $derived(hasGenerationStats ? (predictedTokens! / predictedMs!) * 1000 : 0);
+  let timeInSeconds = $derived(
+    predictedMs !== undefined ? (predictedMs / 1000).toFixed(2) : '0.00'
+  );
 
   let promptTokensPerSecond = $derived(
-    promptTokens !== undefined && promptMs !== undefined
+    promptTokens !== undefined && promptMs !== undefined && promptMs > 0
       ? (promptTokens / promptMs) * 1000
       : undefined
   );
@@ -34,11 +77,14 @@
     promptTokensPerSecond !== undefined &&
     promptTimeInSeconds !== undefined
   );
+
+  // In live mode, generation tab is disabled until we have generation stats
+  let isGenerationDisabled = $derived(isLive && !hasGenerationStats);
 </script>
 
 <div class="inline-flex items-center text-xs text-muted-foreground">
   <div class="inline-flex items-center rounded-sm bg-muted-foreground/15 p-0.5">
-    {#if hasPromptStats}
+    {#if hasPromptStats || isLive}
       <Tooltip.Root>
         <Tooltip.Trigger>
           <button
@@ -65,25 +111,32 @@
             class="inline-flex h-5 w-5 items-center justify-center rounded-sm transition-colors {activeView ===
             ChatMessageStatsView.GENERATION
               ? 'bg-background text-foreground shadow-sm'
-              : 'hover:text-foreground'}"
-            onclick={() => (activeView = ChatMessageStatsView.GENERATION)}
+              : isGenerationDisabled
+                ? 'cursor-not-allowed opacity-40'
+                : 'hover:text-foreground'}"
+            onclick={() => !isGenerationDisabled && (activeView = ChatMessageStatsView.GENERATION)}
+            disabled={isGenerationDisabled}
           >
             <Sparkles class="h-3 w-3" />
             <span class="sr-only">Generation</span>
           </button>
         </Tooltip.Trigger>
         <Tooltip.Content>
-          <p>Generation (token output)</p>
+          <p>
+            {isGenerationDisabled
+              ? 'Generation (waiting for tokens...)'
+              : 'Generation (token output)'}
+          </p>
         </Tooltip.Content>
       </Tooltip.Root>
     </div>
 
     <div class="flex items-center gap-1 px-2">
-      {#if activeView === ChatMessageStatsView.GENERATION}
+      {#if activeView === ChatMessageStatsView.GENERATION && hasGenerationStats}
         <BadgeChatStatistic
           class="bg-transparent"
           icon={WholeWord}
-          value="{predictedTokens} tokens"
+          value="{predictedTokens?.toLocaleString()} tokens"
           tooltipLabel="Generated tokens"
         />
         <BadgeChatStatistic
tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
@@ -185,6 +185,11 @@
         key: 'samplers',
         label: 'Samplers',
         type: 'input'
+      },
+      {
+        key: 'backend_sampling',
+        label: 'Backend sampling',
+        type: 'checkbox'
       }
     ]
   },
tools/server/webui/src/lib/constants/settings-config.ts
@@ -21,6 +21,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
   autoMicOnEmpty: false,
   // make sure these default values are in sync with `common.h`
   samplers: 'top_k;typ_p;top_p;min_p;temperature',
+  backend_sampling: false,
   temperature: 0.8,
   dynatemp_range: 0.0,
   dynatemp_exponent: 1.0,
@@ -57,6 +58,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
     'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
   samplers:
     'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
+  backend_sampling:
+    'Enable backend-based samplers. When enabled, supported samplers run on the accelerator backend for faster sampling.',
   temperature:
     'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
   dynatemp_range:
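
Since the webui forwards `backend_sampling` as a plain top-level request field (see the `ChatService` change further below), any client can opt in without the UI. A hedged sketch; the endpoint path and port are assumptions:

    import requests  # assumed HTTP client

    requests.post(
        "http://localhost:8080/v1/chat/completions",  # hypothetical local llama-server
        json={
            "messages": [{"role": "user", "content": "Hello"}],
            # run supported samplers on the accelerator backend (new in this release)
            "backend_sampling": True,
        },
    )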
tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
@@ -1,10 +1,27 @@
 import { activeProcessingState } from '$lib/stores/chat.svelte';
 import { config } from '$lib/stores/settings.svelte';
 
+export interface LiveProcessingStats {
+  tokensProcessed: number;
+  totalTokens: number;
+  timeMs: number;
+  tokensPerSecond: number;
+  etaSecs?: number;
+}
+
+export interface LiveGenerationStats {
+  tokensGenerated: number;
+  timeMs: number;
+  tokensPerSecond: number;
+}
+
 export interface UseProcessingStateReturn {
   readonly processingState: ApiProcessingState | null;
   getProcessingDetails(): string[];
   getProcessingMessage(): string;
+  getPromptProgressText(): string | null;
+  getLiveProcessingStats(): LiveProcessingStats | null;
+  getLiveGenerationStats(): LiveGenerationStats | null;
   shouldShowDetails(): boolean;
   startMonitoring(): void;
   stopMonitoring(): void;
@@ -29,6 +46,7 @@ export interface UseProcessingStateReturn {
 export function useProcessingState(): UseProcessingStateReturn {
   let isMonitoring = $state(false);
   let lastKnownState = $state<ApiProcessingState | null>(null);
+  let lastKnownProcessingStats = $state<LiveProcessingStats | null>(null);
 
   // Derive processing state reactively from chatStore's direct state
   const processingState = $derived.by(() => {
@@ -46,6 +64,34 @@ export function useProcessingState(): UseProcessingStateReturn {
     }
   });
 
+  // Track last known processing stats for when promptProgress disappears
+  $effect(() => {
+    if (processingState?.promptProgress) {
+      const { processed, total, time_ms, cache } = processingState.promptProgress;
+      const actualProcessed = processed - cache;
+      const actualTotal = total - cache;
+
+      if (actualProcessed > 0 && time_ms > 0) {
+        const tokensPerSecond = actualProcessed / (time_ms / 1000);
+        lastKnownProcessingStats = {
+          tokensProcessed: actualProcessed,
+          totalTokens: actualTotal,
+          timeMs: time_ms,
+          tokensPerSecond
+        };
+      }
+    }
+  });
+
+  function getETASecs(done: number, total: number, elapsedMs: number): number | undefined {
+    const elapsedSecs = elapsedMs / 1000;
+    const progressETASecs =
+      done === 0 || elapsedSecs < 0.5
+        ? undefined // can be the case for the 0% progress report
+        : elapsedSecs * (total / done - 1);
+    return progressETASecs;
+  }
+
   function startMonitoring(): void {
     if (isMonitoring) return;
     isMonitoring = true;
@@ -59,28 +105,25 @@ export function useProcessingState(): UseProcessingStateReturn {
     const currentConfig = config();
     if (!currentConfig.keepStatsVisible) {
       lastKnownState = null;
+      lastKnownProcessingStats = null;
     }
   }
 
   function getProcessingMessage(): string {
-    const state = processingState;
-    if (!state) {
+    if (!processingState) {
       return 'Processing...';
     }
 
-    switch (state.status) {
+    switch (processingState.status) {
       case 'initializing':
         return 'Initializing...';
       case 'preparing':
-        if (state.progressPercent !== undefined) {
-          return `Processing (${state.progressPercent}%)`;
+        if (processingState.progressPercent !== undefined) {
+          return `Processing (${processingState.progressPercent}%)`;
         }
         return 'Preparing response...';
       case 'generating':
-        if (state.tokensDecoded > 0) {
-          return `Generating... (${state.tokensDecoded} tokens)`;
-        }
-        return 'Generating...';
+        return '';
       default:
         return 'Processing...';
     }
@@ -131,8 +174,76 @@ export function useProcessingState(): UseProcessingStateReturn {
   }
 
   function shouldShowDetails(): boolean {
-    const state = processingState;
-    return state !== null && state.status !== 'idle';
+    return processingState !== null && processingState.status !== 'idle';
+  }
+
+  /**
+   * Returns a short progress message with percent
+   */
+  function getPromptProgressText(): string | null {
+    if (!processingState?.promptProgress) return null;
+
+    const { processed, total, cache } = processingState.promptProgress;
+
+    const actualProcessed = processed - cache;
+    const actualTotal = total - cache;
+    const percent = Math.round((actualProcessed / actualTotal) * 100);
+    const eta = getETASecs(actualProcessed, actualTotal, processingState.promptProgress.time_ms);
+
+    if (eta !== undefined) {
+      const etaSecs = Math.ceil(eta);
+      return `Processing ${percent}% (ETA: ${etaSecs}s)`;
+    }
+
+    return `Processing ${percent}%`;
+  }
+
+  /**
+   * Returns live processing statistics for display (prompt processing phase)
+   * Returns last known stats when promptProgress becomes unavailable
+   */
+  function getLiveProcessingStats(): LiveProcessingStats | null {
+    if (processingState?.promptProgress) {
+      const { processed, total, time_ms, cache } = processingState.promptProgress;
+
+      const actualProcessed = processed - cache;
+      const actualTotal = total - cache;
+
+      if (actualProcessed > 0 && time_ms > 0) {
+        const tokensPerSecond = actualProcessed / (time_ms / 1000);
+
+        return {
+          tokensProcessed: actualProcessed,
+          totalTokens: actualTotal,
+          timeMs: time_ms,
+          tokensPerSecond
+        };
+      }
+    }
+
+    // Return last known stats if promptProgress is no longer available
+    return lastKnownProcessingStats;
+  }
+
+  /**
+   * Returns live generation statistics for display (token generation phase)
+   */
+  function getLiveGenerationStats(): LiveGenerationStats | null {
+    if (!processingState) return null;
+
+    const { tokensDecoded, tokensPerSecond } = processingState;
+
+    if (tokensDecoded <= 0) return null;
+
+    // Calculate time from tokens and speed
+    const timeMs =
+      tokensPerSecond && tokensPerSecond > 0 ? (tokensDecoded / tokensPerSecond) * 1000 : 0;
+
+    return {
+      tokensGenerated: tokensDecoded,
+      timeMs,
+      tokensPerSecond: tokensPerSecond || 0
+    };
   }
 
   return {
@@ -141,6 +252,9 @@ export function useProcessingState(): UseProcessingStateReturn {
     },
     getProcessingDetails,
     getProcessingMessage,
+    getPromptProgressText,
+    getLiveProcessingStats,
+    getLiveGenerationStats,
     shouldShowDetails,
     startMonitoring,
     stopMonitoring
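
The `getETASecs` helper above is a constant-rate estimate: if `done` tokens took `elapsed` seconds, the remaining `total - done` tokens should take `elapsed * (total / done - 1)` seconds. The same arithmetic in Python, for illustration only:

    def eta_secs(done: int, total: int, elapsed_ms: float) -> float | None:
        """Constant-rate ETA, mirroring getETASecs above (illustrative sketch)."""
        elapsed = elapsed_ms / 1000
        if done == 0 or elapsed < 0.5:  # too early for a stable estimate
            return None
        return elapsed * (total / done - 1)

    # 512 of 2048 prompt tokens in 4 s -> 3x the work still left -> 12 s remaining
    assert eta_secs(512, 2048, 4000) == 12.0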
tools/server/webui/src/lib/services/chat.ts
@@ -86,6 +86,7 @@ export class ChatService {
       dry_penalty_last_n,
       // Other parameters
       samplers,
+      backend_sampling,
       custom,
       timings_per_token,
       // Config options
@@ -117,7 +118,8 @@ export class ChatService {
         role: msg.role,
         content: msg.content
       })),
-      stream
+      stream,
+      return_progress: stream ? true : undefined
     };
 
     // Include model in request if provided (required in ROUTER mode)
@@ -158,6 +160,8 @@ export class ChatService {
         : samplers;
     }
 
+    if (backend_sampling !== undefined) requestBody.backend_sampling = backend_sampling;
+
     if (timings_per_token !== undefined) requestBody.timings_per_token = timings_per_token;
 
     if (custom) {
@@ -271,7 +275,7 @@ export class ChatService {
     onReasoningChunk?: (chunk: string) => void,
     onToolCallChunk?: (chunk: string) => void,
     onModel?: (model: string) => void,
-    onTimings?: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
+    onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
     conversationId?: string,
     abortSignal?: AbortSignal
   ): Promise<void> {
@@ -366,11 +370,13 @@ export class ChatService {
           onModel?.(chunkModel);
         }
 
-        if (timings || promptProgress) {
+        if (promptProgress) {
+          ChatService.notifyTimings(undefined, promptProgress, onTimings);
+        }
+
+        if (timings) {
           ChatService.notifyTimings(timings, promptProgress, onTimings);
-          if (timings) {
-            lastTimings = timings;
-          }
+          lastTimings = timings;
         }
 
         if (content) {
@@ -768,10 +774,11 @@ export class ChatService {
     timings: ChatMessageTimings | undefined,
     promptProgress: ChatMessagePromptProgress | undefined,
     onTimingsCallback:
-      | ((timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
+      | ((timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
       | undefined
   ): void {
-    if (!timings || !onTimingsCallback) return;
+    if (!onTimingsCallback || (!timings && !promptProgress)) return;
+
     onTimingsCallback(timings, promptProgress);
   }
 }
tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -303,11 +303,17 @@ class ChatStore {
     const currentConfig = config();
     const outputTokensMax = currentConfig.max_tokens || -1;
 
+    // Note: for timings data, the n_prompt does NOT include cache tokens
     const contextUsed = promptTokens + cacheTokens + predictedTokens;
     const outputTokensUsed = predictedTokens;
 
+    // Note: for prompt progress, the "processed" DOES include cache tokens
+    // we need to exclude them to get the real prompt tokens processed count
+    const progressCache = promptProgress?.cache || 0;
+    const progressActualDone = (promptProgress?.processed ?? 0) - progressCache;
+    const progressActualTotal = (promptProgress?.total ?? 0) - progressCache;
     const progressPercent = promptProgress
-      ? Math.round((promptProgress.processed / promptProgress.total) * 100)
+      ? Math.round((progressActualDone / progressActualTotal) * 100)
       : undefined;
 
     return {
@@ -324,6 +330,7 @@ class ChatStore {
       topP: currentConfig.top_p ?? 0.95,
       speculative: false,
       progressPercent,
+      promptProgress,
       promptTokens,
       promptMs,
       cacheTokens
@@ -534,7 +541,7 @@ class ChatStore {
         conversationsStore.updateMessageAtIndex(idx, { toolCalls: streamedToolCallContent });
       },
       onModel: (modelName: string) => recordModel(modelName),
-      onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
+      onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
        const tokensPerSecond =
          timings?.predicted_ms && timings?.predicted_n
            ? (timings.predicted_n / timings.predicted_ms) * 1000
@@ -1032,7 +1039,7 @@ class ChatStore {
         });
       },
 
-      onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
+      onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
        const tokensPerSecond =
          timings?.predicted_ms && timings?.predicted_n
            ? (timings.predicted_n / timings.predicted_ms) * 1000
@@ -1454,6 +1461,8 @@ class ChatStore {
     if (hasValue(currentConfig.dry_penalty_last_n))
       apiOptions.dry_penalty_last_n = Number(currentConfig.dry_penalty_last_n);
     if (currentConfig.samplers) apiOptions.samplers = currentConfig.samplers;
+    if (currentConfig.backend_sampling)
+      apiOptions.backend_sampling = currentConfig.backend_sampling;
     if (currentConfig.custom) apiOptions.custom = currentConfig.custom;
 
     return apiOptions;
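
The cache-exclusion fix above changes what the percentage means: `processed` and `total` both count cached tokens, so a prompt that is mostly cache used to jump straight to a high percentage. Worked numbers mirroring the fixed computation (illustrative only):

    def progress_percent(processed: int, total: int, cache: int) -> int:
        """Mirrors the fixed webui computation: count only newly processed tokens."""
        return round((processed - cache) / (total - cache) * 100)

    # 64 cached tokens, raw progress 80/96: only 16 of 32 new tokens are done
    assert progress_percent(80, 96, 64) == 50   # fixed: 50%
    assert round(80 / 96 * 100) == 83           # old computation: misleading 83%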
tools/server/webui/src/lib/stores/settings.svelte.ts
@@ -294,15 +294,14 @@ class SettingsStore {
    * This sets up the default values from /props endpoint
    */
   syncWithServerDefaults(): void {
-    const serverParams = serverStore.defaultParams;
-    if (!serverParams) {
-      console.warn('No server parameters available for initialization');
+    const propsDefaults = this.getServerDefaults();
+
+    if (Object.keys(propsDefaults).length === 0) {
+      console.warn('No server defaults available for initialization');
 
       return;
     }
 
-    const propsDefaults = this.getServerDefaults();
-
     for (const [key, propsValue] of Object.entries(propsDefaults)) {
       const currentValue = getConfigValue(this.config, key);
 
tools/server/webui/src/lib/types/api.d.ts
@@ -149,6 +149,7 @@ export interface ApiLlamaCppServerProps {
   reasoning_in_content: boolean;
   thinking_forced_open: boolean;
   samplers: string[];
+  backend_sampling: boolean;
   'speculative.n_max': number;
   'speculative.n_min': number;
   'speculative.p_min': number;
@@ -186,6 +187,7 @@ export interface ApiChatCompletionRequest {
   }>;
   stream?: boolean;
   model?: string;
+  return_progress?: boolean;
   // Reasoning parameters
   reasoning_format?: string;
   // Generation parameters
@@ -211,6 +213,7 @@ export interface ApiChatCompletionRequest {
   dry_penalty_last_n?: number;
   // Sampler configuration
   samplers?: string[];
+  backend_sampling?: boolean;
   // Custom parameters (JSON string)
   custom?: Record<string, unknown>;
   timings_per_token?: boolean;
@@ -311,6 +314,7 @@ export interface ApiSlotData {
   reasoning_in_content: boolean;
   thinking_forced_open: boolean;
   samplers: string[];
+  backend_sampling: boolean;
   'speculative.n_max': number;
   'speculative.n_min': number;
   'speculative.p_min': number;
@@ -341,6 +345,7 @@ export interface ApiProcessingState {
   tokensPerSecond?: number;
   // Progress information from prompt_progress
   progressPercent?: number;
+  promptProgress?: ChatMessagePromptProgress;
   promptTokens?: number;
   promptMs?: number;
   cacheTokens?: number;