saeeol 1.0.9 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/npm/bin/saeeol +42 -0
  2. package/npm/package.json +39 -0
  3. package/npm/postinstall.js +162 -0
  4. package/package.json +2 -2
  5. package/src/cli/cmd/mcp-refresh.ts +47 -0
  6. package/src/cli/cmd/mcp.ts +3 -1
  7. package/src/cli/cmd/tui/app-commands-core.tsx +11 -0
  8. package/src/cli/cmd/tui/app-commands-system.tsx +20 -0
  9. package/src/cli/cmd/tui/app-events.ts +43 -0
  10. package/src/cli/cmd/tui/app.tsx +4 -0
  11. package/src/cli/cmd/tui/component/dialog-model.tsx +2 -2
  12. package/src/cli/cmd/tui/component/prompt/use-prompt-memos.ts +1 -1
  13. package/src/cli/cmd/tui/component/use-connected.tsx +1 -1
  14. package/src/cli/cmd/tui/context/local.tsx +10 -3
  15. package/src/cli/cmd/tui/context/route.tsx +5 -1
  16. package/src/cli/cmd/tui/feature-plugins/sidebar/context.tsx +1 -1
  17. package/src/cli/cmd/tui/plugin/api.tsx +7 -3
  18. package/src/cli/cmd/tui/routes/local-models.tsx +151 -0
  19. package/src/cli/cmd/tui/routes/session/subagent-footer.tsx +1 -1
  20. package/src/cli/cmd/tui/util/model.ts +1 -1
  21. package/src/config/config-schema.ts +44 -0
  22. package/src/ltm/config.ts +124 -0
  23. package/src/ltm/events.ts +50 -0
  24. package/src/ltm/index.ts +12 -0
  25. package/src/ltm/memory/episodic.ts +83 -0
  26. package/src/ltm/memory/procedural.ts +102 -0
  27. package/src/ltm/memory/semantic.ts +80 -0
  28. package/src/ltm/pipeline.ts +155 -0
  29. package/src/ltm/retrieval.ts +62 -0
  30. package/src/ltm/scheduler.ts +55 -0
  31. package/src/ltm/store.ts +150 -0
  32. package/src/ltm/types.ts +108 -0
  33. package/src/mcp/index.ts +32 -1
  34. package/src/provider/custom-loaders.ts +12 -0
  35. package/src/provider/loader-local.ts +185 -0
  36. package/src/provider/local/embedder.ts +220 -0
  37. package/src/provider/local/events.ts +74 -0
  38. package/src/provider/local/gpu.ts +93 -0
  39. package/src/provider/local/hub.ts +174 -0
  40. package/src/provider/local/index.ts +10 -0
  41. package/src/provider/local/model-manager.ts +113 -0
  42. package/src/provider/local/orchestrator.ts +301 -0
  43. package/src/provider/local/rag.ts +112 -0
  44. package/src/provider/local/types.ts +142 -0
  45. package/src/provider/provider-conversion.ts +2 -0
  46. package/src/provider/provider-schema.ts +17 -2
  47. package/src/provider/provider-schemas.ts +10 -3
  48. package/src/provider/provider-state.ts +10 -2
  49. package/src/provider/provider.ts +2 -1
  50. package/src/saeeol/plugins/sidebar-usage.tsx +1 -1
  51. package/src/server/routes/instance/config.ts +1 -1
  52. package/src/server/routes/instance/httpapi/api.ts +2 -0
  53. package/src/server/routes/instance/httpapi/groups/local.ts +87 -0
  54. package/src/server/routes/instance/httpapi/groups/mcp.ts +10 -0
  55. package/src/server/routes/instance/httpapi/handlers/local.ts +95 -0
  56. package/src/server/routes/instance/httpapi/handlers/mcp.ts +5 -0
  57. package/src/server/routes/instance/httpapi/handlers/provider.ts +1 -1
  58. package/src/server/routes/instance/httpapi/server.ts +2 -0
  59. package/src/server/routes/instance/provider.ts +2 -2
  60. package/src/session/prompt-reminders.ts +29 -0
  61. package/test/fake/provider.ts +1 -0
  62. package/test/provider/local.test.ts +208 -0
  63. package/test/provider/provider-category.test.ts +190 -0
@@ -0,0 +1,174 @@
1
+ /** HuggingFace Hub client — search, browse, download models */
2
+
3
+ import { Effect } from "effect"
4
+ import * as Log from "@saeeol/core/util/log"
5
+ import { iife } from "@/util/iife"
6
+ import type { HFModelSearch, HFModelInfo, HFSibling, ModelArtifact, Quantization, ModelFormat, RAGAsset, RAGAssetType } from "./types"
7
+
8
+ const log = Log.create({ service: "local/hub" }) // logger tagged with this module's service name
9
+ const API = "https://huggingface.co/api" // base URL that the /models requests in this module are built on
10
+
11
/**
 * Query the HuggingFace Hub model index.
 *
 * @param query Free-text string sent as the `search` query parameter.
 * @param opts  Optional `limit` (20 when omitted) and `tags`; a non-empty
 *              `tags` list is comma-joined and sent as the `tags` parameter.
 * @returns One entry per item of the JSON response, with absent fields
 *          replaced by empty strings, zeros or an empty array.
 * @throws Error when the response status is not OK.
 */
export async function search(query: string, opts?: { limit?: number; tags?: string[] }): Promise<HFModelSearch[]> {
  const maxResults = opts?.limit ?? 20
  const tagList = opts?.tags ?? []

  const params = new URLSearchParams({ search: query, limit: String(maxResults) })
  if (tagList.length) params.set("tags", tagList.join(","))

  const response = await fetch(`${API}/models?${params}`)
  if (!response.ok) {
    throw new Error(`HF search failed: ${response.status} ${response.statusText}`)
  }

  // The payload is not validated; it is assumed to be an array of model records.
  const rows = (await response.json()) as any[]
  const toSearchHit = (row: any): HFModelSearch => ({
    id: row.id ?? row.modelId ?? "",
    name: row.id ?? "",
    author: row.author ?? "",
    downloads: row.downloads ?? 0,
    likes: row.likes ?? 0,
    tags: row.tags ?? [],
    pipelineTag: row.pipeline_tag ?? undefined,
    libraryName: row.library_name ?? undefined,
  })
  return rows.map((row) => toSearchHit(row))
}
34
+
35
/**
 * Fetch the Hub's metadata record for a single repository.
 *
 * @param repo Repository id, interpolated as-is into the `/models/<repo>` path.
 * @returns The record with absent fields defaulted: `repo` for the ids, an
 *          empty string for `sha`, empty arrays, zero counters and `false`
 *          for `private`.
 * @throws Error when the response status is not OK.
 */
export async function info(repo: string): Promise<HFModelInfo> {
  const response = await fetch(`${API}/models/${repo}`)
  if (!response.ok) {
    throw new Error(`HF model info failed: ${response.status} ${response.statusText}`)
  }

  // The payload is not validated; fields are read optimistically.
  const payload = (await response.json()) as any
  const siblings = (payload.siblings ?? []) as HFSibling[]

  const details: HFModelInfo = {
    id: payload.id ?? payload.modelId ?? repo,
    modelId: payload.modelId ?? repo,
    sha: payload.sha ?? "",
    siblings,
    tags: payload.tags ?? [],
    downloads: payload.downloads ?? 0,
    likes: payload.likes ?? 0,
    private: payload.private ?? false,
  }
  return details
}
52
+
53
/**
 * Pick the GGUF files out of a repository's file listing.
 *
 * @param siblings File entries of a repo; only `rfilename` is read.
 * @returns The `rfilename` of every entry ending in ".gguf" (case-sensitive),
 *          in listing order.
 */
export function listGGUF(siblings: HFSibling[]): string[] {
  const ggufNames: string[] = []
  for (const sibling of siblings) {
    const name = sibling.rfilename
    if (name.endsWith(".gguf")) ggufNames.push(name)
  }
  return ggufNames
}
59
+
60
/**
 * Guess the quantization of a model file from its name.
 *
 * The name is lower-cased and searched for each known tag as a substring;
 * the first tag in the list below that occurs anywhere in the name wins.
 *
 * @param filename File name (or path) to inspect.
 * @returns The matching tag, or `undefined` when none occurs in the name.
 */
export function parseQuant(filename: string): Quantization | undefined {
  const haystack = filename.toLowerCase()
  const candidates: Quantization[] = [
    "q2_k",
    "q3_k_s",
    "q3_k_m",
    "q3_k_l",
    "q4_0",
    "q4_1",
    "q4_k_s",
    "q4_k_m",
    "q5_0",
    "q5_1",
    "q5_k_s",
    "q5_k_m",
    "q6_k",
    "q8_0",
    "fp16",
    "bf16",
    "fp32",
  ]
  return candidates.find((tag) => haystack.includes(tag))
}
74
+
75
/**
 * Build the `resolve/main` download URL for a file in a Hub repository.
 *
 * Each "/"-separated segment of `filename` is percent-encoded so that names
 * containing spaces, "#", "?" or similar characters still yield a valid URL;
 * the "/" separators themselves are kept so files in subfolders resolve.
 * Names made only of URL-safe characters produce the same URL as before.
 *
 * @param repo     Repository id, inserted unchanged.
 * @param filename Path of the file inside the repository.
 * @returns Absolute URL on huggingface.co pointing at the `main` revision.
 */
export function downloadURL(repo: string, filename: string): string {
  const encodedPath = filename
    .split("/")
    .map((segment) => encodeURIComponent(segment))
    .join("/")
  return `https://huggingface.co/${repo}/resolve/main/${encodedPath}`
}
79
+
80
/**
 * Assemble a ModelArtifact descriptor for one file of a repository.
 *
 * The format is derived from the file extension, compared case-insensitively
 * so "MODEL.GGUF" is recognised like "model.gguf"; unknown extensions fall
 * back to "pytorch". The quantization comes from `parseQuant` and falls back
 * to "fp16" when the name carries no known tag.
 *
 * @param repo      Repository id the file belongs to.
 * @param filename  File name inside the repository.
 * @param sizeBytes Size of the file in bytes.
 * @param sha256    Optional checksum, copied through unchanged.
 * @returns The artifact, with `id` set to `<repo>/<filename>`.
 */
export function buildArtifact(
  repo: string,
  filename: string,
  sizeBytes: number,
  sha256?: string,
): ModelArtifact {
  // A Map avoids accidental hits on inherited object members (e.g. an
  // extension of "constructor") that a plain-object lookup would return.
  const formatByExtension = new Map<string, ModelFormat>([
    ["gguf", "gguf"],
    ["safetensors", "safetensors"],
    ["bin", "pytorch"],
    ["pt", "pytorch"],
    ["onnx", "onnx"],
  ])

  const extension = (filename.split(".").pop() ?? "").toLowerCase()
  const quantization = parseQuant(filename)

  return {
    id: `${repo}/${filename}`,
    repo,
    filename,
    format: formatByExtension.get(extension) ?? "pytorch",
    quantization: quantization ?? "fp16",
    sizeBytes,
    sha256,
  }
}
107
+
108
/**
 * Stream a file from the Hub to disk, reporting progress as chunks arrive.
 *
 * The function resolves only after the file writer has been closed, so the
 * data has been handed to the file before callers see success. If the
 * transfer fails or is aborted, the reader is cancelled and the partially
 * written file is removed so it cannot be mistaken for a finished download.
 *
 * @param repo     Repository id.
 * @param filename File path inside the repository.
 * @param destPath Destination path on disk; its directory must already exist.
 * @param opts     `onProgress(downloaded, total)` is called after every chunk
 *                 (`total` is 0 when the server sends no content-length);
 *                 `signal` aborts the request.
 * @throws Error on a non-OK response or a missing body; network, abort and
 *         write errors are rethrown after cleanup.
 */
export async function download(
  repo: string,
  filename: string,
  destPath: string,
  opts?: { onProgress?: (downloaded: number, total: number) => void; signal?: AbortSignal },
): Promise<void> {
  const url = downloadURL(repo, filename)

  const res = await fetch(url, { signal: opts?.signal })
  if (!res.ok) throw new Error(`HF download failed: ${res.status} ${res.statusText}`)
  if (!res.body) throw new Error("No response body")

  const total = Number(res.headers.get("content-length") ?? "0")
  const writer = Bun.file(destPath).writer()
  const reader = res.body.getReader()

  let downloaded = 0
  try {
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      writer.write(value)
      downloaded += value.byteLength
      opts?.onProgress?.(downloaded, total)
    }
    // end() may return a promise; wait for it so the file is complete on return.
    await writer.end()
  } catch (err) {
    // Best-effort cleanup; the original failure is what the caller must see.
    await reader.cancel().catch(() => undefined)
    try {
      await writer.end()
    } catch {
      /* writer already failed or closed */
    }
    const { rm } = await import("node:fs/promises")
    await rm(destPath, { force: true }).catch(() => undefined)
    throw err
  }

  log.info("downloaded", { repo, filename, sizeMB: Math.round(downloaded / 1024 / 1024) })
}
142
+
143
/**
 * Search the Hub for models usable as RAG assets of the given type.
 *
 * The asset type is translated to a Hub tag (embedding →
 * "sentence-transformers", reranker → "reranker", vectordb → "vector").
 * This is a best-effort lookup: a non-OK response yields an empty list
 * rather than an exception, but the failure is logged so it is not invisible.
 *
 * @param type  Kind of asset to look for.
 * @param query Optional free-text filter, sent as `search` when non-empty.
 * @param limit Maximum number of results (20 when omitted).
 * @returns One asset per returned model. `format` is always "safetensors",
 *          `sizeBytes` is always 0 and `dimensions` is always undefined,
 *          because the search response is not inspected for them.
 */
export async function searchRAG(
  type: RAGAssetType,
  query?: string,
  limit?: number,
): Promise<RAGAsset[]> {
  const tagMap: Record<RAGAssetType, string> = {
    embedding: "sentence-transformers",
    reranker: "reranker",
    vectordb: "vector",
  }

  const params = new URLSearchParams({
    limit: String(limit ?? 20),
    tags: tagMap[type],
    ...(query ? { search: query } : {}),
  })

  const res = await fetch(`${API}/models?${params}`)
  if (!res.ok) {
    log.warn("HF RAG search failed", { type, status: res.status, statusText: res.statusText })
    return []
  }

  const items = (await res.json()) as any[]
  return items.map((item) => ({
    id: item.id ?? "",
    name: item.id ?? "",
    type,
    repo: item.id ?? "",
    format: "safetensors" as ModelFormat,
    sizeBytes: 0,
    dimensions: undefined,
  }))
}
@@ -0,0 +1,10 @@
1
+ /** Local model orchestration — public API */
2
+
3
+ export * from "./types"
4
+ export * as GPU from "./gpu"
5
+ export * as Hub from "./hub"
6
+ export * as Manager from "./model-manager"
7
+ export * as Orchestrator from "./orchestrator"
8
+ export * as RAG from "./rag"
9
+ export * as Embedder from "./embedder"
10
+ export { LocalModelEvent } from "./events"
@@ -0,0 +1,113 @@
1
+ /** Local model manager — install, remove, list, configure models on disk */
2
+
3
+ import path from "path"
4
+ import { mkdir, rm, readdir, stat } from "fs/promises"
5
+ import { Effect } from "effect"
6
+ import * as Log from "@saeeol/core/util/log"
7
+ import { Global } from "@saeeol/core/global"
8
+ import { iife } from "@/util/iife"
9
+ import type { ModelArtifact, BackendType } from "./types"
10
+ import * as Hub from "./hub"
11
+
12
+ const log = Log.create({ service: "local/manager" }) // logger tagged with this module's service name
13
+
14
/**
 * Location under which every local model, backend directory and RAG asset
 * is kept.
 *
 * @returns `<Global.Path.data>/local-models`.
 */
export function modelsDir(): string {
  const dataRoot = Global.Path.data
  return path.join(dataRoot, "local-models")
}
18
+
19
/**
 * Directory that holds one model artifact.
 *
 * The repo id has its first "/" replaced by "__" to form a single folder
 * name; the artifact's filename is appended as a further path component.
 *
 * @param artifact Artifact whose `repo` and `filename` determine the path.
 * @returns `<modelsDir>/<repo with "/" → "__">/<filename>`.
 */
export function modelDir(artifact: ModelArtifact): string {
  const { repo, filename } = artifact
  const repoFolder = repo.replace("/", "__")
  return path.join(modelsDir(), repoFolder, filename)
}
23
+
24
/**
 * Data directory reserved for one backend.
 *
 * @param backend Backend type, used verbatim as the folder name.
 * @returns `<modelsDir>/backends/<backend>`.
 */
export function backendDir(backend: BackendType): string {
  const root = modelsDir()
  return path.join(root, "backends", backend)
}
28
+
29
/**
 * Directory under which RAG assets are stored.
 *
 * @returns `<modelsDir>/rag`.
 */
export function ragDir(): string {
  const root = modelsDir()
  return path.join(root, "rag")
}
33
+
34
/**
 * Create a directory, including any missing parents; resolves without error
 * when it already exists.
 *
 * @param dir Directory path to create.
 */
async function ensure(dir: string): Promise<void> {
  const options = { recursive: true }
  await mkdir(dir, options)
}
38
+
39
/**
 * Download a model artifact from HuggingFace Hub into the local model store.
 *
 * The file is stored at `<modelDir(artifact)>/<artifact.filename>`. Its parent
 * directory is created first (the original created only the directory above
 * `modelDir`, leaving the actual parent missing). A file already present with
 * exactly `artifact.sizeBytes` bytes is treated as installed and not fetched
 * again.
 *
 * @param artifact Artifact to install.
 * @param opts     Progress callback and abort signal, forwarded to the download.
 * @returns Absolute path of the installed file.
 * @throws Error when the resolved path would fall outside the models
 *         directory (e.g. a filename containing ".." segments), or when the
 *         download fails.
 */
export async function install(
  artifact: ModelArtifact,
  opts?: { onProgress?: (downloaded: number, total: number) => void; signal?: AbortSignal },
): Promise<string> {
  const dest = modelDir(artifact)
  const fullPath = path.join(dest, artifact.filename)

  // Repo and file names come from remote listings; never write outside the store.
  const relative = path.relative(modelsDir(), fullPath)
  const escapes = relative === "" || relative === ".." || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative)
  if (escapes) {
    throw new Error(`Refusing to install outside the models directory: ${artifact.repo}/${artifact.filename}`)
  }

  // Also covers filenames that contain subfolders.
  await ensure(path.dirname(fullPath))

  const exists = await stat(fullPath).catch(() => undefined)
  if (exists?.size === artifact.sizeBytes) {
    log.info("model already installed", { repo: artifact.repo, filename: artifact.filename })
    return fullPath
  }

  log.info("installing model", { repo: artifact.repo, filename: artifact.filename })
  await Hub.download(artifact.repo, artifact.filename, fullPath, opts)
  return fullPath
}
58
+
59
/**
 * Delete an artifact's directory and everything inside it.
 *
 * Removal is forced and recursive, so a directory that does not exist is not
 * an error.
 *
 * @param artifact Artifact whose `modelDir` is removed.
 */
export async function uninstall(artifact: ModelArtifact): Promise<void> {
  const { repo, filename } = artifact
  const target = modelDir(artifact)
  await rm(target, { recursive: true, force: true })
  log.info("uninstalled model", { repo, filename })
}
65
+
66
/**
 * List every model file found in the local model store.
 *
 * Each repo folder under the models root is walked recursively. This matters
 * because `install` stores a file at `<repo folder>/<filename>/<filename>`,
 * one level deeper than the original single-level scan looked, so installed
 * models were never reported. Files placed directly in a repo folder are
 * still listed. The reserved "backends" and "rag" folders are skipped.
 *
 * @returns One entry per regular file. `repo` is the folder name with its
 *          first "__" turned back into "/"; `filename` is the file's base
 *          name; `path` is its absolute location; `sizeBytes` its size.
 *          Unreadable directories and files are skipped silently.
 */
export async function list(): Promise<Array<{ repo: string; filename: string; path: string; sizeBytes: number }>> {
  const root = modelsDir()
  const result: Array<{ repo: string; filename: string; path: string; sizeBytes: number }> = []

  // Depth-first walk collecting regular files below `dir` for the given repo.
  const collect = async (repo: string, dir: string): Promise<void> => {
    const children = await readdir(dir, { withFileTypes: true }).catch(() => [] as import("fs").Dirent[])
    for (const child of children) {
      const childPath = path.join(dir, child.name)
      if (child.isDirectory()) {
        await collect(repo, childPath)
        continue
      }
      // stat (not the Dirent) so symlinks to regular files are still reported.
      const s = await stat(childPath).catch(() => null)
      if (s?.isFile()) {
        result.push({ repo, filename: child.name, path: childPath, sizeBytes: s.size })
      }
    }
  }

  const entries = await readdir(root, { withFileTypes: true }).catch(() => [] as import("fs").Dirent[])
  for (const entry of entries) {
    if (!entry.isDirectory()) continue
    if (entry.name === "backends" || entry.name === "rag") continue

    const repo = entry.name.replace("__", "/")
    await collect(repo, path.join(root, entry.name))
  }

  return result
}
91
+
92
/**
 * Download a RAG asset (embedding model, reranker, vectordb) into the RAG
 * directory.
 *
 * The file is stored at `<ragDir>/<repo with "/" → "__">/<filename>`. The
 * file's own parent directory is created, so filenames that include
 * subfolders (for example "onnx/model.onnx") have somewhere to land. A file
 * already present with exactly `asset.sizeBytes` bytes is not fetched again.
 *
 * @param asset Repository, file name and expected size of the asset.
 * @param opts  Progress callback and abort signal, forwarded to the download.
 * @returns Absolute path of the installed file.
 */
export async function installRAG(
  asset: { repo: string; filename: string; sizeBytes: number },
  opts?: { onProgress?: (downloaded: number, total: number) => void; signal?: AbortSignal },
): Promise<string> {
  const dir = path.join(ragDir(), asset.repo.replace("/", "__"))
  const fullPath = path.join(dir, asset.filename)

  // dirname(fullPath) equals `dir` for plain names and the nested folder otherwise.
  await ensure(path.dirname(fullPath))

  const exists = await stat(fullPath).catch(() => undefined)
  if (exists?.size === asset.sizeBytes) return fullPath

  await Hub.download(asset.repo, asset.filename, fullPath, opts)
  return fullPath
}
107
+
108
/**
 * Report whether an artifact's file is present on disk.
 *
 * @param artifact Artifact to look for at `<modelDir(artifact)>/<filename>`.
 * @returns `true` only when that path is a regular file with a non-zero
 *          size; `false` when it is missing, unreadable, empty or not a file.
 */
export async function isInstalled(artifact: ModelArtifact): Promise<boolean> {
  const target = path.join(modelDir(artifact), artifact.filename)
  const stats = await stat(target).catch(() => undefined)
  if (stats?.isFile() !== true) return false
  return stats.size > 0
}
@@ -0,0 +1,301 @@
1
+ /** Model orchestrator — run multiple models simultaneously with GPU-aware scheduling */
2
+
3
+ import { Effect } from "effect"
4
+ import * as Log from "@saeeol/core/util/log"
5
+ import { Process } from "@/util/process"
6
+ import { Global } from "@saeeol/core/global"
7
+ import type { ModelArtifact, ModelInstance, BackendType, BackendStatus, GPUProfile } from "./types"
8
+ import * as GPU from "./gpu"
9
+ import * as Manager from "./model-manager"
10
+ import { which } from "@/util/which"
11
+ import { iife } from "@/util/iife"
12
+ import path from "path"
13
+ import { mkdir } from "fs/promises"
14
+
15
+ const log = Log.create({ service: "local/orchestrator" }) // logger tagged with this module's service name
16
+
17
+ const running = new Map<string, ModelInstance>() // registry of instances, keyed by instance id
18
+ let nextPort = 11435 // Next port handed out by allocPort; starts after the default Ollama port (11434)
19
+
20
/**
 * Probe for the known local inference backends.
 *
 * For each backend, its candidate commands are tried in order; only the
 * first word of a candidate is looked up with `which`, and the backend counts
 * as available as soon as one lookup succeeds.
 *
 * @returns One status per backend (ollama, lmstudio, vllm, llama.cpp) with
 *          `endpoint` set to localhost on the backend's default port.
 *          `version` is always undefined and `loadedModels` always empty,
 *          since neither is queried here.
 */
export async function detectBackends(): Promise<BackendStatus[]> {
  const candidates: Array<{ type: BackendType; cmds: string[]; defaultPort: number }> = [
    { type: "ollama", cmds: ["ollama"], defaultPort: 11434 },
    { type: "lmstudio", cmds: ["lms"], defaultPort: 1234 },
    { type: "vllm", cmds: ["vllm", "python -m vllm"], defaultPort: 8000 },
    { type: "llama.cpp", cmds: ["llama-server", "llama-cpp-server"], defaultPort: 8080 },
  ]

  const statuses: BackendStatus[] = []
  for (const candidate of candidates) {
    let available = false
    for (const command of candidate.cmds) {
      const executable = command.split(" ")[0]
      if (await which(executable)) {
        available = true
        break
      }
    }

    statuses.push({
      type: candidate.type,
      available,
      endpoint: `http://localhost:${candidate.defaultPort}`,
      version: undefined,
      loadedModels: [],
    })
  }

  return statuses
}
49
+
50
/**
 * Probe an HTTP endpoint to see whether a server answers.
 *
 * @param endpoint URL to request; the request is aborted after 3 seconds.
 * @returns `true` for an OK response or a 404 (the server is up but has no
 *          route there); `false` for any other status and for network
 *          errors or timeouts.
 */
export async function isAlive(endpoint: string): Promise<boolean> {
  let response: Response
  try {
    response = await fetch(endpoint, { signal: AbortSignal.timeout(3000) })
  } catch {
    return false
  }
  if (response.ok) return true
  return response.status === 404
}
59
+
60
/**
 * Hand out the next port number from the module-level counter.
 *
 * The counter only increments; the port is not checked for being free.
 *
 * @returns The counter's value before it was advanced.
 */
function allocPort(): number {
  const port = nextPort
  nextPort += 1
  return port
}
64
+
65
/**
 * Register a local model file with Ollama and describe the resulting instance.
 *
 * The original built the Modelfile text but never handed it to
 * `ollama create -f -`, so Ollama received no model definition. The
 * Modelfile is now written under the Ollama backend directory and passed by
 * path. A non-zero exit code is reported as an error instead of being
 * ignored.
 *
 * @param artifact  Artifact being loaded; its repo id (first "/" → "-",
 *                  lower-cased) becomes the Ollama model name.
 * @param modelPath Path of the model file referenced by the Modelfile.
 * @returns An instance with status "running" on the Ollama port 11434.
 * @throws Error when `ollama create` exits with a non-zero code.
 */
async function loadViaOllama(artifact: ModelArtifact, modelPath: string): Promise<ModelInstance> {
  const port = 11434 // Ollama default
  const name = artifact.repo.replace("/", "-").toLowerCase()

  // Persist the Modelfile so it can be passed with `-f <path>`.
  const modelfileDir = Manager.backendDir("ollama")
  await mkdir(modelfileDir, { recursive: true })
  const modelfilePath = path.join(modelfileDir, `${name}.Modelfile`)
  await Bun.write(modelfilePath, `FROM ${modelPath}\n`)

  const result = await Process.run(["ollama", "create", name, "-f", modelfilePath])

  log.info("ollama create result", { code: result.code })
  if (result.code !== 0) {
    throw new Error(`ollama create failed for ${name} (exit code ${result.code})`)
  }

  return {
    id: name,
    artifact,
    status: "running",
    port,
    endpoint: `http://localhost:${port}`,
  }
}
85
+
86
/**
 * Start a `llama-server` process for a model file and wait until it answers.
 *
 * Changes from the original:
 * - GPU pinning uses `--split-mode none --main-gpu <index>` instead of
 *   `--gpu`, which does not appear to be a llama-server option
 *   (NOTE(review): confirm the flags against the installed llama.cpp version).
 * - The child's stdout/stderr are ignored rather than piped; the pipes were
 *   never read, which can stall the server once they fill up.
 * - The instance is reported as running only after the server responded. If
 *   the process exits or 30 one-second probes pass without a response, the
 *   process is killed and an error is thrown.
 *
 * @param artifact  Artifact being served.
 * @param modelPath Path of the model file passed to `-m`.
 * @param gpuIndex  Optional GPU to pin the model to.
 * @returns The running instance, also stored in the `running` registry under
 *          `llamacpp-<port>`.
 * @throws Error when the server does not become reachable.
 */
async function loadViaLlamaCpp(artifact: ModelArtifact, modelPath: string, gpuIndex?: number): Promise<ModelInstance> {
  const port = allocPort()

  const args = ["-m", modelPath, "--port", String(port), "-c", "4096", "-ngl", "99"]
  if (gpuIndex !== undefined) args.push("--split-mode", "none", "--main-gpu", String(gpuIndex))

  const proc = Bun.spawn(["llama-server", ...args], {
    stdout: "ignore",
    stderr: "ignore",
    windowsHide: true,
    detached: true,
  })

  // Poll once per second; stop early if the child has already exited.
  let ready = false
  for (let attempt = 0; attempt < 30 && proc.exitCode === null; attempt++) {
    await new Promise((r) => setTimeout(r, 1000))
    if (await isAlive(`http://localhost:${port}`)) {
      ready = true
      break
    }
  }

  if (!ready) {
    const exitCode = proc.exitCode
    try {
      proc.kill()
    } catch {
      /* already dead */
    }
    throw new Error(
      exitCode === null
        ? `llama-server did not become ready on port ${port}`
        : `llama-server exited with code ${exitCode} before becoming ready`,
    )
  }

  const instance: ModelInstance = {
    id: `llamacpp-${port}`,
    artifact,
    status: "running",
    pid: proc.pid,
    port,
    gpuIndex,
    endpoint: `http://localhost:${port}/v1`,
  }

  running.set(instance.id, instance)
  return instance
}
119
+
120
/**
 * Start a `vllm serve` process for a model and wait until it answers.
 *
 * The server is launched with the repository id (`artifact.repo`), not with
 * `modelPath`, which this function does not use. GPU pinning is done through
 * `CUDA_VISIBLE_DEVICES`.
 *
 * Changes from the original:
 * - The child's stdout/stderr are ignored rather than piped; the pipes were
 *   never read, which can stall the server once they fill up.
 * - The instance is reported as running only after the server responded. If
 *   the process exits or 120 two-second probes pass without a response, the
 *   process is killed and an error is thrown.
 *
 * @param artifact  Artifact being served.
 * @param modelPath Local path of the artifact (unused here).
 * @param gpuIndex  Optional GPU index exported as `CUDA_VISIBLE_DEVICES`.
 * @returns The running instance, also stored in the `running` registry under
 *          `vllm-<port>`.
 * @throws Error when the server does not become reachable.
 */
async function loadViaVLLM(artifact: ModelArtifact, modelPath: string, gpuIndex?: number): Promise<ModelInstance> {
  const port = allocPort()

  const args = [
    "serve", artifact.repo,
    "--port", String(port),
    "--served-model-name", artifact.repo,
    "--trust-remote-code",
  ]

  const proc = Bun.spawn(["vllm", ...args], {
    stdout: "ignore",
    stderr: "ignore",
    windowsHide: true,
    detached: true,
    env: {
      ...process.env,
      ...(gpuIndex !== undefined ? { CUDA_VISIBLE_DEVICES: String(gpuIndex) } : {}),
    },
  })

  // vLLM takes longer to start: poll every two seconds, stop early on exit.
  let ready = false
  for (let attempt = 0; attempt < 120 && proc.exitCode === null; attempt++) {
    await new Promise((r) => setTimeout(r, 2000))
    if (await isAlive(`http://localhost:${port}`)) {
      ready = true
      break
    }
  }

  if (!ready) {
    const exitCode = proc.exitCode
    try {
      proc.kill()
    } catch {
      /* already dead */
    }
    throw new Error(
      exitCode === null
        ? `vllm did not become ready on port ${port}`
        : `vllm exited with code ${exitCode} before becoming ready`,
    )
  }

  const instance: ModelInstance = {
    id: `vllm-${port}`,
    artifact,
    status: "running",
    pid: proc.pid,
    port,
    gpuIndex,
    endpoint: `http://localhost:${port}/v1`,
  }

  running.set(instance.id, instance)
  return instance
}
161
+
162
/**
 * Load one model, choosing GPU and backend automatically unless overridden.
 *
 * Steps: estimate the VRAM needed, pick a GPU (the caller's `gpuIndex`, else
 * the best fit reported by `GPU.findBestGPU`), install the artifact if
 * necessary, choose a backend, then start it. Backend preference when not
 * given: for GGUF files llama.cpp, then Ollama; otherwise vLLM, then Ollama;
 * llama.cpp as the last resort. Backends without a dedicated loader also go
 * through the llama.cpp loader.
 *
 * While the backend starts, a "starting" placeholder is kept in the `running`
 * registry. The original never removed it, leaving a permanent ghost entry
 * next to the real instance. It is now always removed, and on success the
 * real instance is registered under its own id.
 *
 * @param artifact Artifact to load.
 * @param opts     Optional backend and GPU overrides.
 * @returns The started instance, carrying the estimated `vramUsageMB` and the
 *          chosen `gpuIndex` when the loader did not set them.
 * @throws Whatever the install step or the chosen loader throws.
 */
export async function load(
  artifact: ModelArtifact,
  opts?: { backend?: BackendType; gpuIndex?: number },
): Promise<ModelInstance> {
  const gpuProfile = await Effect.runPromise(GPU.profile)
  const vramNeeded = GPU.estimateVRAM(artifact.sizeBytes, artifact.quantization)

  // Pick GPU
  const gpuIndex = opts?.gpuIndex ?? GPU.findBestGPU(gpuProfile, vramNeeded)?.index

  if (gpuIndex === undefined && gpuProfile.cudaAvailable) {
    log.warn("insufficient VRAM, model may run on CPU", { needed: vramNeeded, available: gpuProfile.availableVRAMMB })
  }

  // Install if needed
  const modelPath = await Manager.install(artifact)

  // Pick backend
  const backends = await detectBackends()
  const isAvailable = (type: BackendType): boolean => backends.find((b) => b.type === type)?.available === true
  const preferred = opts?.backend ?? iife(() => {
    if (artifact.format === "gguf") {
      if (isAvailable("llama.cpp")) return "llama.cpp" as BackendType
      if (isAvailable("ollama")) return "ollama" as BackendType
    }
    if (isAvailable("vllm")) return "vllm" as BackendType
    if (isAvailable("ollama")) return "ollama" as BackendType
    return "llama.cpp" as BackendType
  })

  log.info("loading model", { repo: artifact.repo, backend: preferred, gpuIndex })

  // Visible to runningInstances() while the backend is still starting.
  const placeholderId = `${preferred}-${artifact.repo}`
  running.set(placeholderId, {
    id: placeholderId,
    artifact,
    status: "starting",
    gpuIndex,
    vramUsageMB: vramNeeded,
  })

  const startBackend = async (): Promise<ModelInstance> => {
    switch (preferred) {
      case "ollama":
        return loadViaOllama(artifact, modelPath)
      case "vllm":
        return loadViaVLLM(artifact, modelPath, gpuIndex)
      case "llama.cpp":
      default:
        return loadViaLlamaCpp(artifact, modelPath, gpuIndex)
    }
  }

  // Drop the placeholder whether startup succeeded or failed.
  const loaded = await startBackend().finally(() => running.delete(placeholderId))

  const instance: ModelInstance = {
    ...loaded,
    gpuIndex: loaded.gpuIndex ?? gpuIndex,
    vramUsageMB: loaded.vramUsageMB ?? vramNeeded,
  }
  running.set(instance.id, instance)
  return instance
}
219
+
220
/**
 * Stop a registered model instance and drop it from the registry.
 *
 * Does nothing for an unknown id. When the instance has a pid, that process
 * is signalled with `process.kill`; a failure to signal is ignored. The
 * instance object is marked "stopped" with its pid cleared before removal.
 *
 * @param instanceId Id under which the instance was registered.
 */
export async function unload(instanceId: string): Promise<void> {
  const instance = running.get(instanceId)
  if (!instance) return

  if (instance.pid) {
    try {
      process.kill(instance.pid)
    } catch {
      /* already dead */
    }
  }

  // Update the shared object in place so holders of the reference see the new state.
  const mutable = instance as any
  mutable.status = "stopped"
  mutable.pid = undefined

  running.delete(instanceId)
  log.info("unloaded model", { id: instanceId })
}
234
+
235
/**
 * Snapshot of the instance registry.
 *
 * @returns A new array holding every registered instance, in insertion order.
 */
export function runningInstances(): ModelInstance[] {
  return Array.from(running.values())
}
239
+
240
/**
 * Look up a registered instance by its endpoint URL.
 *
 * @param url Endpoint to match exactly against each instance's `endpoint`.
 * @returns The first matching instance in insertion order, or `undefined`.
 */
export function findByEndpoint(url: string): ModelInstance | undefined {
  for (const instance of running.values()) {
    if (instance.endpoint === url) return instance
  }
  return undefined
}
244
+
245
/**
 * Load several models one after another, spreading them over the GPUs.
 *
 * Artifacts are processed largest first. For each, the GPU with the most
 * remaining VRAM that still fits the estimate is chosen and the estimate is
 * reserved on it; when no GPU fits, `load` is called without a GPU index and
 * makes its own choice. A failed load does not abort the batch: it produces
 * an entry with status "error", and its VRAM reservation is now released so
 * later models are not placed as if the failed one were occupying memory.
 *
 * @param artifacts Artifacts to load; the input array is not mutated.
 * @returns One instance per artifact, in size-descending order.
 */
export async function loadBatch(
  artifacts: ModelArtifact[],
): Promise<ModelInstance[]> {
  const gpuProfile = await Effect.runPromise(GPU.profile)
  const results: ModelInstance[] = []

  // Sort by size descending — load largest first to ensure they get GPU space
  const sorted = [...artifacts].sort((a, b) => b.sizeBytes - a.sizeBytes)

  // Track per-GPU allocation, seeded with what is already in use.
  const gpuAlloc = new Map<number, number>() // gpuIndex -> usedMB
  for (const g of gpuProfile.gpus) gpuAlloc.set(g.index, g.vramTotalMB - g.vramFreeMB)

  for (const artifact of sorted) {
    const vramNeeded = GPU.estimateVRAM(artifact.sizeBytes, artifact.quantization)

    // Find the GPU with the most remaining space that can hold this model.
    let bestGpu: number | undefined
    let bestFree = 0
    for (const [idx, used] of gpuAlloc) {
      const gpu = gpuProfile.gpus.find((g) => g.index === idx)
      if (!gpu) continue
      const free = gpu.vramTotalMB - used
      if (free >= vramNeeded && free > bestFree) {
        bestGpu = idx
        bestFree = free
      }
    }

    if (bestGpu !== undefined) {
      gpuAlloc.set(bestGpu, (gpuAlloc.get(bestGpu) ?? 0) + vramNeeded)
    }

    try {
      const instance = await load(artifact, { gpuIndex: bestGpu })
      results.push(instance)
    } catch (e) {
      // Give the reservation back: the model is not occupying that memory.
      if (bestGpu !== undefined) {
        gpuAlloc.set(bestGpu, (gpuAlloc.get(bestGpu) ?? 0) - vramNeeded)
      }
      log.error("failed to load model in batch", { repo: artifact.repo, error: e })
      results.push({
        id: `error-${artifact.repo}`,
        artifact,
        status: "error",
        error: e instanceof Error ? e.message : String(e),
      })
    }
  }

  return results
}
295
+
296
/**
 * Unload every registered instance, one at a time.
 *
 * The ids are copied up front so the registry can shrink while iterating.
 */
export async function unloadAll(): Promise<void> {
  const ids = Array.from(running.keys())
  for (const instanceId of ids) {
    await unload(instanceId)
  }
}