npm - @appkit/llamacpp-cli - Versions diffs - 1.4.1 → 1.5.0 - Mend

@appkit/llamacpp-cli 1.4.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/CHANGELOG.md +8 -0
package/README.md +87 -1
package/dist/cli.js +14 -0
package/dist/cli.js.map +1 -1
package/dist/commands/monitor.d.ts +2 -0
package/dist/commands/monitor.d.ts.map +1 -0
package/dist/commands/monitor.js +76 -0
package/dist/commands/monitor.js.map +1 -0
package/dist/lib/metrics-aggregator.d.ts +39 -0
package/dist/lib/metrics-aggregator.d.ts.map +1 -0
package/dist/lib/metrics-aggregator.js +200 -0
package/dist/lib/metrics-aggregator.js.map +1 -0
package/dist/lib/system-collector.d.ts +75 -0
package/dist/lib/system-collector.d.ts.map +1 -0
package/dist/lib/system-collector.js +310 -0
package/dist/lib/system-collector.js.map +1 -0
package/dist/tui/MonitorApp.d.ts +4 -0
package/dist/tui/MonitorApp.d.ts.map +1 -0
package/dist/tui/MonitorApp.js +293 -0
package/dist/tui/MonitorApp.js.map +1 -0
package/dist/tui/MultiServerMonitorApp.d.ts +4 -0
package/dist/tui/MultiServerMonitorApp.d.ts.map +1 -0
package/dist/tui/MultiServerMonitorApp.js +496 -0
package/dist/tui/MultiServerMonitorApp.js.map +1 -0
package/dist/tui/components/ErrorState.d.ts +8 -0
package/dist/tui/components/ErrorState.d.ts.map +1 -0
package/dist/tui/components/ErrorState.js +22 -0
package/dist/tui/components/ErrorState.js.map +1 -0
package/dist/tui/components/LoadingState.d.ts +8 -0
package/dist/tui/components/LoadingState.d.ts.map +1 -0
package/dist/tui/components/LoadingState.js +21 -0
package/dist/tui/components/LoadingState.js.map +1 -0
package/dist/types/monitor-types.d.ts +122 -0
package/dist/types/monitor-types.d.ts.map +1 -0
package/dist/types/monitor-types.js +3 -0
package/dist/types/monitor-types.js.map +1 -0
package/dist/utils/process-utils.d.ts +16 -1
package/dist/utils/process-utils.d.ts.map +1 -1
package/dist/utils/process-utils.js +144 -27
package/dist/utils/process-utils.js.map +1 -1
package/package.json +3 -1
package/src/cli.ts +14 -0
package/src/commands/monitor.ts +90 -0
package/src/lib/metrics-aggregator.ts +244 -0
package/src/lib/system-collector.ts +312 -0
package/src/tui/MonitorApp.ts +361 -0
package/src/tui/MultiServerMonitorApp.ts +547 -0
package/src/types/monitor-types.ts +161 -0
package/src/utils/process-utils.ts +160 -26

package/src/lib/metrics-aggregator.ts ADDED Viewed

@@ -0,0 +1,244 @@
+import { ServerConfig } from '../types/server-config.js';
+import { ServerMetrics, SlotInfo, MonitorData } from '../types/monitor-types.js';
+import { statusChecker } from './status-checker.js';
+import { systemCollector } from './system-collector.js';
+import { getProcessMemory } from '../utils/process-utils.js';
+/**
+ * Aggregates metrics from llama.cpp server API endpoints
+ * Combines server health (/health), slot status (/slots), and model
+ * properties (/props) into a single ServerMetrics snapshot.
+ *
+ * Instances are stateful: the decode counter of every busy slot is remembered
+ * between polls so tokens/second can be derived from the delta. Reuse the same
+ * instance across polls of one server, otherwise no speed is ever reported.
+ */
+export class MetricsAggregator {
+  private serverUrl: string;
+  private timeout: number;
+  /** Last observed decode counter (and when it was observed) per slot id */
+  private previousSlots: Map<number, { n_decoded: number; timestamp: number }> = new Map();
+  /**
+   * @param server - Server whose host/port are used to build the base URL
+   * @param timeout - Default per-request timeout in milliseconds
+   */
+  constructor(server: ServerConfig, timeout: number = 5000) {
+    // Handle null host (legacy configs) by defaulting to 127.0.0.1
+    const host = server.host || '127.0.0.1';
+    this.serverUrl = `http://${host}:${server.port}`;
+    this.timeout = timeout;
+  }
+  /**
+   * Fetch JSON from the llama.cpp API with a timeout
+   * @param endpoint - Path appended to the server base URL (e.g. '/health')
+   * @param customTimeout - Optional timeout in ms overriding the instance default
+   * @returns Parsed JSON body, or null on non-2xx status, network error,
+   *          timeout, or invalid JSON (this method never throws)
+   */
+  private async fetchWithTimeout(
+    endpoint: string,
+    customTimeout?: number
+  ): Promise<any | null> {
+    const controller = new AbortController();
+    const timeoutMs = customTimeout ?? this.timeout;
+    const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
+    try {
+      const response = await fetch(`${this.serverUrl}${endpoint}`, {
+        signal: controller.signal,
+      });
+      if (!response.ok) {
+        return null;
+      }
+      // The timer is still armed here so a stalled body read is aborted too
+      return await response.json();
+    } catch {
+      // Network error, timeout, or parse error
+      return null;
+    } finally {
+      // Always release the timer, including when fetch() itself rejected
+      clearTimeout(timeoutId);
+    }
+  }
+  /**
+   * Get server health status
+   * @returns true only when /health answered with { status: 'ok' }
+   */
+  private async getHealth(): Promise<boolean> {
+    const health = await this.fetchWithTimeout('/health');
+    return health !== null && health.status === 'ok';
+  }
+  /**
+   * Get server properties (model info, context size, etc.)
+   * @returns The parsed /props body, or null when it could not be fetched
+   */
+  private async getProps(): Promise<any> {
+    return await this.fetchWithTimeout('/props');
+  }
+  /**
+   * Get active slots information with calculated tok/s
+   * Speed is derived from the change in a slot's decode counter since the
+   * previous poll, so the first poll of a busy slot carries no timings.
+   * @returns One SlotInfo per entry of /slots, or [] when the endpoint is
+   *          unavailable or does not return an array
+   */
+  private async getSlots(): Promise<SlotInfo[]> {
+    const data = await this.fetchWithTimeout('/slots');
+    if (!Array.isArray(data)) {
+      return [];
+    }
+    const now = Date.now();
+    return data.map((slot: any) => {
+      const slotId = slot.id;
+      const n_decoded = slot.next_token?.[0]?.n_decoded || 0;
+      const isProcessing = slot.is_processing;
+      // Calculate tokens per second by comparing with previous poll
+      let predicted_per_second: number | undefined;
+      if (isProcessing && n_decoded > 0) {
+        const previous = this.previousSlots.get(slotId);
+        if (previous && previous.n_decoded < n_decoded) {
+          const tokensGenerated = n_decoded - previous.n_decoded;
+          const timeElapsed = (now - previous.timestamp) / 1000; // Convert to seconds
+          if (timeElapsed > 0) {
+            predicted_per_second = tokensGenerated / timeElapsed;
+          }
+        }
+        // Store current state for next comparison
+        this.previousSlots.set(slotId, { n_decoded, timestamp: now });
+      } else {
+        // Slot is idle, or busy but has not decoded anything yet: drop the
+        // baseline so a previous request's counter cannot skew the next sample
+        this.previousSlots.delete(slotId);
+      }
+      return {
+        id: slotId,
+        state: isProcessing ? 'processing' : 'idle',
+        n_prompt_tokens: slot.n_prompt_tokens,
+        n_decoded,
+        n_ctx: slot.n_ctx || 0,
+        // Only the generation rate is measured here; the other timing fields
+        // are zero placeholders
+        timings: predicted_per_second
+          ? {
+              prompt_n: 0,
+              prompt_ms: 0,
+              prompt_per_token_ms: 0,
+              prompt_per_second: 0,
+              predicted_n: n_decoded,
+              predicted_ms: 0,
+              predicted_per_token_ms: 0,
+              predicted_per_second,
+            }
+          : undefined,
+      };
+    });
+  }
+  /**
+   * Aggregate all server metrics
+   * @param server - Server configuration
+   * @param processMemory - Optional pre-fetched process memory (for batch collection);
+   *                        pass null to skip the per-process lookup entirely
+   * @returns Snapshot of the server; when the process is not running only the
+   *          configured model name/context size are filled in
+   */
+  async collectServerMetrics(
+    server: ServerConfig,
+    processMemory?: number | null
+  ): Promise<ServerMetrics> {
+    const now = Date.now();
+    // Check basic server status first
+    const status = await statusChecker.checkServer(server);
+    // If server not running, return minimal data
+    if (!status.isRunning) {
+      return {
+        server,
+        healthy: false,
+        modelLoaded: false,
+        modelName: server.modelName,
+        contextSize: server.ctxSize,
+        totalSlots: 0,
+        activeSlots: 0,
+        idleSlots: 0,
+        slots: [],
+        timestamp: now,
+        stale: false,
+      };
+    }
+    // Calculate uptime if the server has a usable lastStarted
+    let uptime: string | undefined;
+    if (server.lastStarted) {
+      const startTime = new Date(server.lastStarted).getTime();
+      // Skip unparsable timestamps instead of rendering "NaNh NaNm NaNs"
+      if (Number.isFinite(startTime)) {
+        // Clamp at zero so a start time in the future never shows negative uptime
+        const uptimeSeconds = Math.max(0, Math.floor((now - startTime) / 1000));
+        const hours = Math.floor(uptimeSeconds / 3600);
+        const minutes = Math.floor((uptimeSeconds % 3600) / 60);
+        const seconds = uptimeSeconds % 60;
+        uptime = `${hours}h ${minutes}m ${seconds}s`;
+      }
+    }
+    // Fetch detailed metrics in parallel
+    // If processMemory was pre-fetched (batch mode), use it; otherwise fetch individually
+    const [healthy, props, slots, fetchedMemory] = await Promise.all([
+      this.getHealth(),
+      this.getProps(),
+      this.getSlots(),
+      processMemory !== undefined
+        ? Promise.resolve(processMemory)
+        : (server.pid ? getProcessMemory(server.pid) : Promise.resolve(null)),
+    ]);
+    // Calculate slot statistics
+    const activeSlots = slots.filter((s) => s.state === 'processing').length;
+    const idleSlots = slots.filter((s) => s.state === 'idle').length;
+    const totalSlots = props?.total_slots || slots.length;
+    // Calculate average speeds (only from processing slots that have timings)
+    const processingSlots = slots.filter((s) => s.state === 'processing' && s.timings);
+    const avgPromptSpeed =
+      processingSlots.length > 0
+        ? processingSlots.reduce(
+            (sum, s) => sum + (s.timings?.prompt_per_second || 0),
+            0
+          ) / processingSlots.length
+        : undefined;
+    const avgGenerateSpeed =
+      processingSlots.length > 0
+        ? processingSlots.reduce(
+            (sum, s) => sum + (s.timings?.predicted_per_second || 0),
+            0
+          ) / processingSlots.length
+        : undefined;
+    return {
+      server,
+      healthy,
+      uptime,
+      // /props answering at all is treated as "model loaded"
+      modelLoaded: props !== null,
+      modelName: server.modelName,
+      contextSize: props?.default_generation_settings?.n_ctx || server.ctxSize,
+      totalSlots,
+      activeSlots,
+      idleSlots,
+      slots,
+      avgPromptSpeed,
+      avgGenerateSpeed,
+      processMemory: fetchedMemory ?? undefined,
+      timestamp: now,
+      stale: false,
+    };
+  }
+  /**
+   * Collect complete monitoring data (server + system metrics)
+   * @param server - Server configuration
+   * @param updateInterval - Polling interval in ms, echoed back in the result
+   * @returns Combined snapshot with consecutiveFailures reset to 0
+   */
+  async collectMonitorData(
+    server: ServerConfig,
+    updateInterval: number = 2000
+  ): Promise<MonitorData> {
+    // Collect server and system metrics in parallel
+    const [serverMetrics, systemMetrics] = await Promise.all([
+      this.collectServerMetrics(server),
+      systemCollector.collectSystemMetrics(),
+    ]);
+    return {
+      server: serverMetrics,
+      system: systemMetrics,
+      lastUpdated: new Date(),
+      updateInterval,
+      consecutiveFailures: 0,
+    };
+  }
+}

package/src/lib/system-collector.ts ADDED Viewed

@@ -0,0 +1,312 @@
+import { execCommand, spawnAndReadOneLine } from '../utils/process-utils.js';
+import { SystemMetrics } from '../types/monitor-types.js';
+/**
+ * System metrics collector using macmon (optional) and vm_stat (fallback)
+ * Provides GPU, CPU, ANE, and memory metrics on macOS
+ *
+ * Results are cached for CACHE_TTL_MS and concurrent callers share a single
+ * in-flight collection, so at most one macmon process is spawned at a time.
+ */
+export class SystemCollector {
+  private macmonPath: string;
+  /** null until the first availability probe has run */
+  private macmonAvailable: boolean | null = null;
+  private lastSystemMetrics: SystemMetrics | null = null;
+  /** Completion time (ms since epoch) of the collection held in lastSystemMetrics */
+  private lastCollectionTime: number = 0;
+  private readonly CACHE_TTL_MS = 4000; // Cache for 4 seconds (longer than macmon spawn time)
+  /** In-flight collection shared by concurrent callers, or null when idle */
+  private collectingLock: Promise<SystemMetrics> | null = null;
+  private pCoreCount: number = 0;
+  private eCoreCount: number = 0;
+  private totalCores: number = 0;
+  /**
+   * @param macmonPath - Path of the macmon binary to probe and spawn
+   */
+  constructor(macmonPath: string = '/opt/homebrew/bin/macmon') {
+    this.macmonPath = macmonPath;
+    // Fire-and-forget: initializeCoreCount handles its own errors, and
+    // parseMacmonJson uses a plain average until the counts are known
+    void this.initializeCoreCount();
+  }
+  /**
+   * Get CPU core counts for weighted average calculation
+   * Falls back from per-cluster counts to an even split of hw.ncpu, and
+   * finally to an assumed 4+4 layout. Never throws.
+   */
+  private async initializeCoreCount(): Promise<void> {
+    try {
+      let pCores = 0;
+      let eCores = 0;
+      // Try to get P-core and E-core counts separately (Apple Silicon)
+      try {
+        const pOut = await execCommand('sysctl -n hw.perflevel0.physicalcpu 2>/dev/null');
+        const eOut = await execCommand('sysctl -n hw.perflevel1.physicalcpu 2>/dev/null');
+        pCores = parseInt(pOut, 10) || 0;
+        eCores = parseInt(eOut, 10) || 0;
+      } catch {
+        // perflevel keys not available; handled by the fallback below
+      }
+      // Fall back to total core count when perflevel failed OR produced no
+      // usable numbers (stderr is discarded, so missing keys may yield '')
+      if (pCores + eCores === 0) {
+        const totalOut = await execCommand('sysctl -n hw.ncpu 2>/dev/null');
+        const total = parseInt(totalOut, 10) || 0;
+        // Assume equal split if we can't get individual counts
+        pCores = Math.floor(total / 2);
+        eCores = total - pCores;
+      }
+      this.pCoreCount = pCores;
+      this.eCoreCount = eCores;
+      this.totalCores = pCores + eCores;
+    } catch {
+      // Default to 8 cores if we can't detect
+      this.pCoreCount = 4;
+      this.eCoreCount = 4;
+      this.totalCores = 8;
+    }
+  }
+  /**
+   * Check if macmon is available
+   * The result of the first probe is memoized for the lifetime of the instance.
+   */
+  private async checkMacmonAvailability(): Promise<boolean> {
+    if (this.macmonAvailable !== null) {
+      return this.macmonAvailable;
+    }
+    try {
+      const result = await execCommand(`which ${this.macmonPath} 2>/dev/null`);
+      this.macmonAvailable = result.length > 0;
+    } catch {
+      this.macmonAvailable = false;
+    }
+    return this.macmonAvailable;
+  }
+  /**
+   * Parse macmon JSON output
+   * Expected format from 'macmon pipe':
+   * {
+   *   "gpu_usage": [count, percentage],
+   *   "pcpu_usage": [count, percentage],
+   *   "ecpu_usage": [count, percentage],
+   *   "ane_power": number,
+   *   "temp": {"cpu_temp_avg": number, "gpu_temp_avg": number}
+   * }
+   * @returns Percentages in the 0-100 range; fields that could not be derived
+   *          are undefined, and an empty object is returned for invalid JSON
+   */
+  private parseMacmonJson(jsonLine: string): {
+    gpuUsage?: number;
+    cpuUsage?: number;
+    aneUsage?: number;
+    temperature?: number;
+  } {
+    try {
+      const data = JSON.parse(jsonLine);
+      // GPU usage (second element of array, convert decimal to percentage)
+      const gpuUsage = data.gpu_usage?.[1] !== undefined
+        ? data.gpu_usage[1] * 100
+        : undefined;
+      // CPU usage (weighted average of P-cores and E-cores)
+      // Each core type reports 0.0-1.0 utilization
+      const pcpuUsage = data.pcpu_usage?.[1] || 0;  // 0.0-1.0
+      const ecpuUsage = data.ecpu_usage?.[1] || 0;  // 0.0-1.0
+      const cpuUsage = this.totalCores > 0
+        // Weighted average: (P% * Pcount + E% * Ecount) / totalCores, as 0-100%
+        ? ((pcpuUsage * this.pCoreCount) + (ecpuUsage * this.eCoreCount)) / this.totalCores * 100
+        // Fallback: simple average if core counts not available (yet)
+        : ((pcpuUsage + ecpuUsage) / 2) * 100;
+      // ANE usage (estimate from power draw - macmon doesn't provide usage %)
+      // If ANE power > 0.1W, consider it active (rough estimate)
+      const aneUsage = data.ane_power > 0.1
+        ? Math.min((data.ane_power / 8.0) * 100, 100) // Assume ~8W max for ANE
+        : 0;
+      // Temperature (use GPU temp if available, otherwise CPU)
+      const temperature = data.temp?.gpu_temp_avg || data.temp?.cpu_temp_avg;
+      return {
+        gpuUsage,
+        cpuUsage: cpuUsage > 0 ? cpuUsage : undefined,
+        aneUsage: aneUsage > 1 ? aneUsage : undefined,
+        temperature,
+      };
+    } catch {
+      return {};
+    }
+  }
+  /**
+   * Collect macmon metrics (GPU, CPU, ANE)
+   * Uses 'macmon pipe' which outputs one JSON line per update
+   * Spawns macmon, reads one line, and kills it to prevent process leaks
+   * @returns Parsed metrics, or null when macmon is unavailable, produced no
+   *          output in time, or failed to run
+   */
+  private async getMacmonMetrics(): Promise<{
+    gpuUsage?: number;
+    cpuUsage?: number;
+    aneUsage?: number;
+    temperature?: number;
+  } | null> {
+    const available = await this.checkMacmonAvailability();
+    if (!available) {
+      return null;
+    }
+    try {
+      // Spawn macmon pipe, read one line, and kill it
+      // This prevents orphaned macmon processes
+      // Timeout set to 5s because macmon can take 3-4s to produce first line
+      const output = await spawnAndReadOneLine(this.macmonPath, ['pipe'], 5000);
+      if (!output) {
+        return null;
+      }
+      return this.parseMacmonJson(output);
+    } catch {
+      return null;
+    }
+  }
+  /**
+   * Parse vm_stat output for memory metrics
+   * Expected format:
+   * Mach Virtual Memory Statistics: (page size of 16384 bytes)
+   * Pages free:                               123456.
+   * Pages active:                             234567.
+   * Pages inactive:                           345678.
+   * Pages speculative:                        45678.
+   * Pages throttled:                          0.
+   * Pages wired down:                         123456.
+   * ...
+   * Pages stored in compressor:               234567.
+   * Pages occupied by compressor:             98765.
+   * ...
+   * @returns Used and total memory in bytes (both 0 when nothing matched)
+   */
+  private parseVmStatOutput(output: string): {
+    memoryUsed: number;
+    memoryTotal: number;
+  } {
+    // Prefer the page size vm_stat reports in its header; 16KB (Apple Silicon)
+    // is only the fallback when the header is missing
+    const pageSizeMatch = output.match(/page size of (\d+) bytes/);
+    const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1], 10) : 16384;
+    let pagesActive = 0;
+    let pagesWired = 0;
+    let pagesCompressed = 0;
+    let pagesFree = 0;
+    let pagesInactive = 0;
+    let pagesSpeculative = 0;
+    for (const line of output.split('\n')) {
+      const match = line.match(/Pages (.*?):\s+(\d+)\./);
+      if (!match) {
+        continue;
+      }
+      const value = parseInt(match[2], 10);
+      switch (match[1].toLowerCase()) {
+        case 'active':
+          pagesActive = value;
+          break;
+        case 'wired down':
+          pagesWired = value;
+          break;
+        // vm_stat labels the physical pages held by the compressor
+        // "occupied by compressor"; 'compressed' is kept as an alias
+        case 'occupied by compressor':
+        case 'compressed':
+          pagesCompressed = value;
+          break;
+        case 'free':
+          pagesFree = value;
+          break;
+        case 'inactive':
+          pagesInactive = value;
+          break;
+        case 'speculative':
+          pagesSpeculative = value;
+          break;
+      }
+    }
+    // Calculate used memory (active + wired + compressed)
+    const usedPages = pagesActive + pagesWired + pagesCompressed;
+    // Calculate total memory (used + free + inactive + speculative)
+    const totalPages = usedPages + pagesFree + pagesInactive + pagesSpeculative;
+    return {
+      memoryUsed: usedPages * pageSize,
+      memoryTotal: totalPages * pageSize,
+    };
+  }
+  /**
+   * Collect vm_stat memory metrics
+   * @returns Parsed memory figures, or zeros when vm_stat could not be run
+   */
+  private async getVmStatMetrics(): Promise<{
+    memoryUsed: number;
+    memoryTotal: number;
+  }> {
+    try {
+      const output = await execCommand('vm_stat 2>/dev/null');
+      return this.parseVmStatOutput(output);
+    } catch {
+      // Fallback to zeros if vm_stat fails
+      return { memoryUsed: 0, memoryTotal: 0 };
+    }
+  }
+  /**
+   * Collect all system metrics
+   * Uses macmon for GPU/CPU/ANE/temperature and vm_stat for memory
+   * Caches results for CACHE_TTL_MS (4s) to prevent spawning multiple macmon processes
+   * @returns Cached metrics when still fresh, otherwise a new collection
+   */
+  async collectSystemMetrics(): Promise<SystemMetrics> {
+    // Return cached data if still fresh
+    if (this.lastSystemMetrics && (Date.now() - this.lastCollectionTime) < this.CACHE_TTL_MS) {
+      return this.lastSystemMetrics;
+    }
+    // If already collecting, wait for that to finish
+    if (this.collectingLock) {
+      return this.collectingLock;
+    }
+    // Start fresh collection
+    this.collectingLock = this.doCollectSystemMetrics();
+    try {
+      const metrics = await this.collectingLock;
+      this.lastSystemMetrics = metrics;
+      // Stamp the cache when collection finishes: macmon can take several
+      // seconds, so stamping at the start would leave almost none of the TTL
+      this.lastCollectionTime = Date.now();
+      return metrics;
+    } finally {
+      this.collectingLock = null;
+    }
+  }
+  /**
+   * Internal method to actually collect system metrics
+   * Called by collectSystemMetrics with caching/locking
+   */
+  private async doCollectSystemMetrics(): Promise<SystemMetrics> {
+    const warnings: string[] = [];
+    const now = Date.now();
+    // macmon and vm_stat are independent and neither helper rejects, so run
+    // them concurrently. Memory always comes from vm_stat (more reliable than macmon)
+    const [macmonMetrics, memoryMetrics] = await Promise.all([
+      this.getMacmonMetrics(),
+      this.getVmStatMetrics(),
+    ]);
+    // Determine source and add warnings
+    let source: 'macmon' | 'vm_stat' | 'none';
+    if (macmonMetrics) {
+      source = 'macmon';
+    } else if (memoryMetrics.memoryTotal > 0) {
+      source = 'vm_stat';
+      warnings.push('macmon not available - showing memory metrics only');
+    } else {
+      source = 'none';
+      warnings.push('Unable to collect system metrics');
+    }
+    return {
+      gpuUsage: macmonMetrics?.gpuUsage,
+      cpuUsage: macmonMetrics?.cpuUsage,
+      aneUsage: macmonMetrics?.aneUsage,
+      temperature: macmonMetrics?.temperature,
+      memoryUsed: memoryMetrics.memoryUsed,
+      memoryTotal: memoryMetrics.memoryTotal,
+      timestamp: now,
+      source,
+      warnings: warnings.length > 0 ? warnings : undefined,
+    };
+  }
+}
+// Export singleton instance, built with the default macmon path
+// ('/opt/homebrew/bin/macmon'). The constructor calls initializeCoreCount(),
+// so core-count detection is kicked off as soon as this module is evaluated.
+export const systemCollector = new SystemCollector();