saeeol 1.0.9 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/npm/bin/saeeol +42 -0
- package/npm/package.json +39 -0
- package/npm/postinstall.js +162 -0
- package/package.json +2 -2
- package/src/cli/cmd/mcp-refresh.ts +47 -0
- package/src/cli/cmd/mcp.ts +3 -1
- package/src/cli/cmd/tui/app-commands-core.tsx +11 -0
- package/src/cli/cmd/tui/app-commands-system.tsx +20 -0
- package/src/cli/cmd/tui/app-events.ts +43 -0
- package/src/cli/cmd/tui/app.tsx +4 -0
- package/src/cli/cmd/tui/component/dialog-model.tsx +2 -2
- package/src/cli/cmd/tui/component/prompt/use-prompt-memos.ts +1 -1
- package/src/cli/cmd/tui/component/use-connected.tsx +1 -1
- package/src/cli/cmd/tui/context/local.tsx +10 -3
- package/src/cli/cmd/tui/context/route.tsx +5 -1
- package/src/cli/cmd/tui/feature-plugins/sidebar/context.tsx +1 -1
- package/src/cli/cmd/tui/plugin/api.tsx +7 -3
- package/src/cli/cmd/tui/routes/local-models.tsx +151 -0
- package/src/cli/cmd/tui/routes/session/subagent-footer.tsx +1 -1
- package/src/cli/cmd/tui/util/model.ts +1 -1
- package/src/config/config-schema.ts +44 -0
- package/src/ltm/config.ts +124 -0
- package/src/ltm/events.ts +50 -0
- package/src/ltm/index.ts +12 -0
- package/src/ltm/memory/episodic.ts +83 -0
- package/src/ltm/memory/procedural.ts +102 -0
- package/src/ltm/memory/semantic.ts +80 -0
- package/src/ltm/pipeline.ts +155 -0
- package/src/ltm/retrieval.ts +62 -0
- package/src/ltm/scheduler.ts +55 -0
- package/src/ltm/store.ts +150 -0
- package/src/ltm/types.ts +108 -0
- package/src/mcp/index.ts +32 -1
- package/src/provider/custom-loaders.ts +12 -0
- package/src/provider/loader-local.ts +185 -0
- package/src/provider/local/embedder.ts +220 -0
- package/src/provider/local/events.ts +74 -0
- package/src/provider/local/gpu.ts +93 -0
- package/src/provider/local/hub.ts +174 -0
- package/src/provider/local/index.ts +10 -0
- package/src/provider/local/model-manager.ts +113 -0
- package/src/provider/local/orchestrator.ts +301 -0
- package/src/provider/local/rag.ts +112 -0
- package/src/provider/local/types.ts +142 -0
- package/src/provider/provider-conversion.ts +2 -0
- package/src/provider/provider-schema.ts +17 -2
- package/src/provider/provider-schemas.ts +10 -3
- package/src/provider/provider-state.ts +10 -2
- package/src/provider/provider.ts +2 -1
- package/src/saeeol/plugins/sidebar-usage.tsx +1 -1
- package/src/server/routes/instance/config.ts +1 -1
- package/src/server/routes/instance/httpapi/api.ts +2 -0
- package/src/server/routes/instance/httpapi/groups/local.ts +87 -0
- package/src/server/routes/instance/httpapi/groups/mcp.ts +10 -0
- package/src/server/routes/instance/httpapi/handlers/local.ts +95 -0
- package/src/server/routes/instance/httpapi/handlers/mcp.ts +5 -0
- package/src/server/routes/instance/httpapi/handlers/provider.ts +1 -1
- package/src/server/routes/instance/httpapi/server.ts +2 -0
- package/src/server/routes/instance/provider.ts +2 -2
- package/src/session/prompt-reminders.ts +29 -0
- package/test/fake/provider.ts +1 -0
- package/test/provider/local.test.ts +208 -0
- package/test/provider/provider-category.test.ts +190 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/** HuggingFace Hub client — search, browse, download models */
|
|
2
|
+
|
|
3
|
+
import { Effect } from "effect"
|
|
4
|
+
import * as Log from "@saeeol/core/util/log"
|
|
5
|
+
import { iife } from "@/util/iife"
|
|
6
|
+
import type { HFModelSearch, HFModelInfo, HFSibling, ModelArtifact, Quantization, ModelFormat, RAGAsset, RAGAssetType } from "./types"
|
|
7
|
+
|
|
8
|
+
const log = Log.create({ service: "local/hub" })
|
|
9
|
+
const API = "https://huggingface.co/api"
|
|
10
|
+
|
|
11
|
+
/**
 * Query the HuggingFace Hub model index.
 *
 * @param query Free-text string sent as the `search` query parameter.
 * @param opts Optional `limit` (defaults to 20) and `tags` (sent comma-joined
 *   as the `tags` query parameter when non-empty).
 * @returns One summary record per item in the Hub response.
 * @throws Error when the Hub answers with a non-OK status.
 */
export async function search(query: string, opts?: { limit?: number; tags?: string[] }): Promise<HFModelSearch[]> {
  const params = new URLSearchParams({ search: query, limit: String(opts?.limit ?? 20) })
  if (opts?.tags?.length) params.set("tags", opts.tags.join(","))

  const response = await fetch(`${API}/models?${params}`)
  if (!response.ok) {
    throw new Error(`HF search failed: ${response.status} ${response.statusText}`)
  }

  const payload = (await response.json()) as any[]
  const summaries: HFModelSearch[] = []
  for (const entry of payload) {
    summaries.push({
      id: entry.id ?? entry.modelId ?? "",
      name: entry.id ?? "",
      author: entry.author ?? "",
      downloads: entry.downloads ?? 0,
      likes: entry.likes ?? 0,
      tags: entry.tags ?? [],
      // Missing or null fields are normalised to undefined.
      pipelineTag: entry.pipeline_tag ?? undefined,
      libraryName: entry.library_name ?? undefined,
    })
  }
  return summaries
}
|
|
34
|
+
|
|
35
|
+
/**
 * Fetch the Hub metadata record for a single repository.
 *
 * @param repo Repository identifier, interpolated into the `/models/…` API path.
 * @returns The record with defaults filled in for any field the response omits.
 * @throws Error when the Hub answers with a non-OK status.
 */
export async function info(repo: string): Promise<HFModelInfo> {
  const response = await fetch(`${API}/models/${repo}`)
  if (!response.ok) {
    throw new Error(`HF model info failed: ${response.status} ${response.statusText}`)
  }

  const body = (await response.json()) as any
  const siblings = (body.siblings ?? []) as HFSibling[]
  const details: HFModelInfo = {
    id: body.id ?? body.modelId ?? repo,
    modelId: body.modelId ?? repo,
    sha: body.sha ?? "",
    siblings,
    tags: body.tags ?? [],
    downloads: body.downloads ?? 0,
    likes: body.likes ?? 0,
    private: body.private ?? false,
  }
  return details
}
|
|
52
|
+
|
|
53
|
+
/**
 * Collect the names of the `.gguf` files among a repo's siblings.
 *
 * @param siblings Sibling entries of a repo; only `rfilename` is read.
 * @returns The `rfilename` values ending in ".gguf", in their original order.
 */
export function listGGUF(siblings: HFSibling[]): string[] {
  const names: string[] = []
  for (const sibling of siblings) {
    const candidate = sibling.rfilename
    if (candidate.endsWith(".gguf")) names.push(candidate)
  }
  return names
}
|
|
59
|
+
|
|
60
|
+
/**
 * Infer the quantization label from a model filename.
 *
 * Matching is case-insensitive. Canonical labels are matched as substrings,
 * in the order listed below. If none matches, the short float spellings
 * `f16` / `f32` (as a separate token, e.g. "model-F16.gguf") are accepted as
 * aliases for `fp16` / `fp32`.
 *
 * @param filename File name (or path) to inspect.
 * @returns The detected quantization, or `undefined` when nothing matches.
 */
export function parseQuant(filename: string): Quantization | undefined {
  const lower = filename.toLowerCase()
  const quants: Quantization[] = [
    "q2_k", "q3_k_s", "q3_k_m", "q3_k_l",
    "q4_0", "q4_1", "q4_k_s", "q4_k_m",
    "q5_0", "q5_1", "q5_k_s", "q5_k_m",
    "q6_k", "q8_0", "fp16", "bf16", "fp32",
  ]
  for (const q of quants) {
    if (lower.includes(q)) return q
  }

  // GGUF files are commonly named with "F16"/"F32" rather than "fp16"/"fp32".
  // Require a non-alphanumeric boundary so "bf16" or "half16" never match here.
  const aliases: Array<[RegExp, Quantization]> = [
    [/(^|[^a-z0-9])f16([^a-z0-9]|$)/, "fp16"],
    [/(^|[^a-z0-9])f32([^a-z0-9]|$)/, "fp32"],
  ]
  for (const [pattern, quant] of aliases) {
    if (pattern.test(lower)) return quant
  }
  return undefined
}
|
|
74
|
+
|
|
75
|
+
/**
 * Build the `resolve/main` download URL for a file in a Hub repository.
 *
 * Each "/"-separated segment of `repo` and `filename` is percent-encoded, so
 * names containing spaces, "#", "?" and similar characters still yield a valid
 * URL. The "/" separators themselves are preserved, which keeps "org/name"
 * repos and files inside sub-folders working.
 *
 * @param repo Repository identifier such as "org/name".
 * @param filename File path inside the repository.
 * @returns The absolute https URL of the file on the `main` revision.
 */
export function downloadURL(repo: string, filename: string): string {
  const encodePath = (value: string): string => value.split("/").map(encodeURIComponent).join("/")
  return `https://huggingface.co/${encodePath(repo)}/resolve/main/${encodePath(filename)}`
}
|
|
79
|
+
|
|
80
|
+
/**
 * Assemble a ModelArtifact description for one file of a repository.
 *
 * The format is derived from the file extension (case-insensitive); unknown or
 * missing extensions fall back to "pytorch". The quantization comes from
 * `parseQuant(filename)` and falls back to "fp16" when none is detected.
 *
 * @param repo Repository identifier the file belongs to.
 * @param filename File name inside the repository.
 * @param sizeBytes Size of the file in bytes, stored as given.
 * @param sha256 Optional checksum, stored as given.
 * @returns The artifact record, with `id` set to `${repo}/${filename}`.
 */
export function buildArtifact(
  repo: string,
  filename: string,
  sizeBytes: number,
  sha256?: string,
): ModelArtifact {
  const q = parseQuant(filename)

  // Lower-case the extension so "MODEL.GGUF" is classified like "model.gguf".
  const dot = filename.lastIndexOf(".")
  const ext = dot >= 0 ? filename.slice(dot + 1).toLowerCase() : ""

  // A Map (rather than a plain object) so that extensions such as
  // "constructor" cannot resolve to inherited Object.prototype members.
  const formats = new Map<string, ModelFormat>([
    ["gguf", "gguf"],
    ["safetensors", "safetensors"],
    ["bin", "pytorch"],
    ["pt", "pytorch"],
    ["onnx", "onnx"],
  ])

  return {
    id: `${repo}/${filename}`,
    repo,
    filename,
    format: formats.get(ext) ?? "pytorch",
    quantization: q ?? "fp16",
    sizeBytes,
    sha256,
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * Stream a file from the Hub to disk, reporting progress as chunks arrive.
 *
 * Data is written to a sibling `<destPath>.part` file and renamed to
 * `destPath` only after the whole body was received and flushed. A failed or
 * aborted download therefore never leaves a partial file at `destPath`.
 *
 * @param repo Repository identifier.
 * @param filename File inside the repository.
 * @param destPath Final location of the downloaded file.
 * @param opts.onProgress Called after every chunk with (bytes so far, total).
 *   `total` is 0 when the server sent no content-length header.
 * @param opts.signal Abort signal passed to `fetch`.
 * @throws Error on a non-OK response, a missing body, or a body shorter or
 *   longer than the announced content-length.
 */
export async function download(
  repo: string,
  filename: string,
  destPath: string,
  opts?: { onProgress?: (downloaded: number, total: number) => void; signal?: AbortSignal },
): Promise<void> {
  const url = downloadURL(repo, filename)

  const res = await fetch(url, { signal: opts?.signal })
  if (!res.ok) throw new Error(`HF download failed: ${res.status} ${res.statusText}`)
  if (!res.body) throw new Error("No response body")

  const total = Number(res.headers.get("content-length") ?? "0")
  // With a content-encoding the decoded byte count need not equal
  // content-length, so the length check below is skipped in that case.
  const lengthIsExact = total > 0 && !res.headers.get("content-encoding")

  // Function-scope import: this module has no file-level fs import.
  const { rename, rm } = await import("fs/promises")
  const partPath = `${destPath}.part`
  const writer = Bun.file(partPath).writer()
  const reader = res.body.getReader()

  let downloaded = 0
  let complete = false
  try {
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      writer.write(value)
      downloaded += value.length
      opts?.onProgress?.(downloaded, total)
    }
    if (lengthIsExact && downloaded !== total) {
      throw new Error(`HF download incomplete: received ${downloaded} of ${total} bytes`)
    }
    complete = true
  } finally {
    // end() may flush asynchronously; wait for it so the data is on disk
    // before the file is renamed or removed.
    await writer.end()
    if (!complete) {
      await reader.cancel().catch(() => undefined)
      await rm(partPath, { force: true })
    }
  }

  await rename(partPath, destPath)
  log.info("downloaded", { repo, filename, sizeMB: Math.round(downloaded / 1024 / 1024) })
}
|
|
142
|
+
|
|
143
|
+
/**
 * Search the Hub for RAG assets of one kind (embedding / reranker / vectordb).
 *
 * This is a best-effort lookup: a failed request or an unexpected response
 * shape yields an empty list instead of throwing, but is logged as a warning.
 *
 * @param type Asset kind; mapped to the Hub tag used as the `tags` filter.
 * @param query Optional free-text search string.
 * @param limit Maximum number of results requested (defaults to 20).
 * @returns One RAGAsset per returned model. `format` is always "safetensors",
 *   `sizeBytes` is 0 and `dimensions` is undefined, because this code does not
 *   read them from the response.
 */
export async function searchRAG(
  type: RAGAssetType,
  query?: string,
  limit?: number,
): Promise<RAGAsset[]> {
  const tagMap: Record<RAGAssetType, string> = {
    embedding: "sentence-transformers",
    reranker: "reranker",
    vectordb: "vector",
  }

  const params = new URLSearchParams({
    limit: String(limit ?? 20),
    tags: tagMap[type],
    ...(query ? { search: query } : {}),
  })

  const res = await fetch(`${API}/models?${params}`)
  if (!res.ok) {
    log.warn("HF RAG search failed", { type, status: res.status, statusText: res.statusText })
    return []
  }

  const items: unknown = await res.json()
  if (!Array.isArray(items)) {
    log.warn("HF RAG search returned an unexpected payload", { type })
    return []
  }

  return items.map((item) => ({
    id: item.id ?? "",
    name: item.id ?? "",
    type,
    repo: item.id ?? "",
    format: "safetensors" as ModelFormat,
    sizeBytes: 0,
    dimensions: undefined,
  }))
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/** Local model orchestration — public API */
|
|
2
|
+
|
|
3
|
+
export * from "./types"
|
|
4
|
+
export * as GPU from "./gpu"
|
|
5
|
+
export * as Hub from "./hub"
|
|
6
|
+
export * as Manager from "./model-manager"
|
|
7
|
+
export * as Orchestrator from "./orchestrator"
|
|
8
|
+
export * as RAG from "./rag"
|
|
9
|
+
export * as Embedder from "./embedder"
|
|
10
|
+
export { LocalModelEvent } from "./events"
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
/** Local model manager — install, remove, list, configure models on disk */
|
|
2
|
+
|
|
3
|
+
import path from "path"
|
|
4
|
+
import { mkdir, rm, readdir, stat } from "fs/promises"
|
|
5
|
+
import { Effect } from "effect"
|
|
6
|
+
import * as Log from "@saeeol/core/util/log"
|
|
7
|
+
import { Global } from "@saeeol/core/global"
|
|
8
|
+
import { iife } from "@/util/iife"
|
|
9
|
+
import type { ModelArtifact, BackendType } from "./types"
|
|
10
|
+
import * as Hub from "./hub"
|
|
11
|
+
|
|
12
|
+
const log = Log.create({ service: "local/manager" })
|
|
13
|
+
|
|
14
|
+
/**
 * Location under which every local model lives.
 *
 * @returns `<Global.Path.data>/local-models`.
 */
export function modelsDir(): string {
  const dataRoot = Global.Path.data
  return path.join(dataRoot, "local-models")
}
|
|
18
|
+
|
|
19
|
+
/**
 * Directory reserved for one model artifact.
 *
 * @param artifact Artifact whose `repo` and `filename` determine the path.
 * @returns `<modelsDir>/<repo>/<filename>`, where the first "/" of the repo
 *   is replaced by "__".
 */
export function modelDir(artifact: ModelArtifact): string {
  const repoFolder = artifact.repo.replace("/", "__")
  return path.join(modelsDir(), repoFolder, artifact.filename)
}
|
|
23
|
+
|
|
24
|
+
/**
 * Directory holding data that belongs to one backend.
 *
 * @param backend Backend identifier, used as the last path segment.
 * @returns `<modelsDir>/backends/<backend>`.
 */
export function backendDir(backend: BackendType): string {
  const segments = ["backends", backend]
  return path.join(modelsDir(), ...segments)
}
|
|
28
|
+
|
|
29
|
+
/**
 * Directory under which RAG assets are stored.
 *
 * @returns `<modelsDir>/rag`.
 */
export function ragDir(): string {
  const root = modelsDir()
  return path.join(root, "rag")
}
|
|
33
|
+
|
|
34
|
+
/**
 * Create `dir` and any missing parents.
 *
 * @param dir Directory path to create (recursive mkdir).
 */
async function ensure(dir: string): Promise<void> {
  const options = { recursive: true }
  await mkdir(dir, options)
}
|
|
38
|
+
|
|
39
|
+
/**
 * Download a model artifact from the Hub unless it is already on disk.
 *
 * The file is stored at `<modelDir(artifact)>/<artifact.filename>`. An existing
 * file whose size equals `artifact.sizeBytes` is treated as installed and is
 * not downloaded again.
 *
 * @param artifact Artifact to install.
 * @param opts Progress callback and abort signal, forwarded to `Hub.download`.
 * @returns Absolute path of the model file.
 */
export async function install(
  artifact: ModelArtifact,
  opts?: { onProgress?: (downloaded: number, total: number) => void; signal?: AbortSignal },
): Promise<string> {
  const dest = modelDir(artifact)
  const fullPath = path.join(dest, artifact.filename)

  // Create the directory that will actually contain the file. Creating only
  // the parent of `dest` would leave `dest` itself (and any sub-folder that is
  // part of the filename) missing when the download starts writing.
  await ensure(path.dirname(fullPath))

  const exists = await stat(fullPath).catch(() => undefined)
  if (exists?.size === artifact.sizeBytes) {
    log.info("model already installed", { repo: artifact.repo, filename: artifact.filename })
    return fullPath
  }

  log.info("installing model", { repo: artifact.repo, filename: artifact.filename })
  await Hub.download(artifact.repo, artifact.filename, fullPath, opts)
  return fullPath
}
|
|
58
|
+
|
|
59
|
+
/**
 * Delete an artifact's directory and everything in it.
 *
 * The removal is forced and recursive, so a directory that does not exist is
 * not an error.
 *
 * @param artifact Artifact whose `modelDir` is removed.
 */
export async function uninstall(artifact: ModelArtifact): Promise<void> {
  const { repo, filename } = artifact
  const target = modelDir(artifact)
  await rm(target, { recursive: true, force: true })
  log.info("uninstalled model", { repo, filename })
}
|
|
65
|
+
|
|
66
|
+
/**
 * Enumerate the model files found under `modelsDir()`.
 *
 * Layout scanned: `<root>/<repo with "/" as "__">/<artifact dir>/<file>`,
 * which is where `install` and `isInstalled` place files. Regular files lying
 * directly in a repo directory are reported as well. The "backends" and "rag"
 * directories are skipped. Unreadable directories or entries are ignored
 * rather than failing the whole listing.
 *
 * @returns One record per model file, with its repo, file name, absolute path
 *   and size in bytes.
 */
export async function list(): Promise<Array<{ repo: string; filename: string; path: string; sizeBytes: number }>> {
  const root = modelsDir()
  const result: Array<{ repo: string; filename: string; path: string; sizeBytes: number }> = []

  const entries = await readdir(root, { withFileTypes: true }).catch(() => [] as import("fs").Dirent[])
  for (const entry of entries) {
    if (!entry.isDirectory()) continue
    if (entry.name === "backends" || entry.name === "rag") continue

    const repo = entry.name.replace("__", "/")
    const repoDir = path.join(root, entry.name)
    const children = await readdir(repoDir).catch(() => [] as string[])

    for (const child of children) {
      const childPath = path.join(repoDir, child)
      const childStat = await stat(childPath).catch(() => null)
      if (!childStat) continue

      if (childStat.isFile()) {
        // File stored directly in the repo directory.
        result.push({ repo, filename: child, path: childPath, sizeBytes: childStat.size })
        continue
      }
      if (!childStat.isDirectory()) continue

      // Per-artifact directory: the model file lives inside it.
      const files = await readdir(childPath).catch(() => [] as string[])
      for (const file of files) {
        const filePath = path.join(childPath, file)
        const s = await stat(filePath).catch(() => null)
        if (s?.isFile()) {
          result.push({ repo, filename: file, path: filePath, sizeBytes: s.size })
        }
      }
    }
  }

  return result
}
|
|
91
|
+
|
|
92
|
+
/**
 * Fetch a RAG asset file into `<ragDir>/<repo with "/" as "__">/` unless a
 * file of exactly `asset.sizeBytes` bytes is already there.
 *
 * @param asset Repo, file name and expected size of the asset.
 * @param opts Progress callback and abort signal, forwarded to `Hub.download`.
 * @returns Absolute path of the asset file.
 */
export async function installRAG(
  asset: { repo: string; filename: string; sizeBytes: number },
  opts?: { onProgress?: (downloaded: number, total: number) => void; signal?: AbortSignal },
): Promise<string> {
  const targetDir = path.join(ragDir(), asset.repo.replace("/", "__"))
  await ensure(targetDir)

  const target = path.join(targetDir, asset.filename)
  const current = await stat(target).catch(() => undefined)
  const alreadyPresent = current?.size === asset.sizeBytes
  if (!alreadyPresent) {
    await Hub.download(asset.repo, asset.filename, target, opts)
  }
  return target
}
|
|
107
|
+
|
|
108
|
+
/**
 * Report whether an artifact's file is present on disk.
 *
 * The file must be a regular, non-empty file. When the artifact declares a
 * positive `sizeBytes`, the on-disk size must also match it exactly. This is
 * the same criterion `install` uses to skip a download, so a truncated file is
 * not reported as installed.
 *
 * @param artifact Artifact to look for.
 * @returns `true` when the file is present (and complete, if the size is known).
 */
export async function isInstalled(artifact: ModelArtifact): Promise<boolean> {
  const fullPath = path.join(modelDir(artifact), artifact.filename)
  const s = await stat(fullPath).catch(() => undefined)
  if (s === undefined || !s.isFile() || s.size <= 0) return false

  // Size unknown (0 or missing): fall back to "exists and is non-empty".
  if (!(artifact.sizeBytes > 0)) return true
  return s.size === artifact.sizeBytes
}
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
/** Model orchestrator — run multiple models simultaneously with GPU-aware scheduling */
|
|
2
|
+
|
|
3
|
+
import { Effect } from "effect"
|
|
4
|
+
import * as Log from "@saeeol/core/util/log"
|
|
5
|
+
import { Process } from "@/util/process"
|
|
6
|
+
import { Global } from "@saeeol/core/global"
|
|
7
|
+
import type { ModelArtifact, ModelInstance, BackendType, BackendStatus, GPUProfile } from "./types"
|
|
8
|
+
import * as GPU from "./gpu"
|
|
9
|
+
import * as Manager from "./model-manager"
|
|
10
|
+
import { which } from "@/util/which"
|
|
11
|
+
import { iife } from "@/util/iife"
|
|
12
|
+
import path from "path"
|
|
13
|
+
import { mkdir } from "fs/promises"
|
|
14
|
+
|
|
15
|
+
const log = Log.create({ service: "local/orchestrator" })
|
|
16
|
+
|
|
17
|
+
const running = new Map<string, ModelInstance>()
|
|
18
|
+
let nextPort = 11435 // Start after default Ollama port
|
|
19
|
+
|
|
20
|
+
/**
 * Probe for the known local inference backends.
 *
 * A backend counts as available when the executable named by the first word
 * of any of its candidate commands is found via `which`.
 *
 * @returns One status per known backend, in a fixed order. `endpoint` is the
 *   backend's default localhost port, `version` is always undefined and
 *   `loadedModels` is always empty; this function does not contact the
 *   backend.
 */
export async function detectBackends(): Promise<BackendStatus[]> {
  const candidates: Array<{ type: BackendType; cmds: string[]; defaultPort: number }> = [
    { type: "ollama", cmds: ["ollama"], defaultPort: 11434 },
    { type: "lmstudio", cmds: ["lms"], defaultPort: 1234 },
    { type: "vllm", cmds: ["vllm", "python -m vllm"], defaultPort: 8000 },
    { type: "llama.cpp", cmds: ["llama-server", "llama-cpp-server"], defaultPort: 8080 },
  ]

  const statuses: BackendStatus[] = []
  for (const candidate of candidates) {
    let available = false
    for (const command of candidate.cmds) {
      // Only the executable name is looked up, not its arguments.
      const found = await which(command.split(" ")[0])
      if (found) {
        available = true
        break
      }
    }

    statuses.push({
      type: candidate.type,
      available,
      endpoint: `http://localhost:${candidate.defaultPort}`,
      version: undefined,
      loadedModels: [],
    })
  }

  return statuses
}
|
|
49
|
+
|
|
50
|
+
/**
 * Probe an HTTP endpoint to see whether a server answers.
 *
 * @param endpoint URL to request; the request is aborted after 3000 ms.
 * @returns `true` for an OK response or a 404 (the server is up, the path is
 *   merely unknown); `false` for any other status or when the request fails.
 */
export async function isAlive(endpoint: string): Promise<boolean> {
  try {
    const response = await fetch(endpoint, { signal: AbortSignal.timeout(3000) })
    if (response.ok) return true
    return response.status === 404
  } catch {
    return false
  }
}
|
|
59
|
+
|
|
60
|
+
/**
 * Hand out the next port number from the module-level counter.
 *
 * The counter is simply incremented on every call; the port is not checked
 * for being free.
 *
 * @returns The port number reserved for the caller.
 */
function allocPort(): number {
  const port = nextPort
  nextPort += 1
  return port
}
|
|
64
|
+
|
|
65
|
+
/**
 * Register a model file with Ollama.
 *
 * A one-line Modelfile (`FROM <modelPath>`) is written under the Ollama
 * backend directory and passed to `ollama create`. The instance is reported
 * on Ollama's default port 11434; this function does not start a server.
 *
 * @param artifact Artifact being loaded; its repo determines the model name
 *   (first "/" replaced by "-", lower-cased).
 * @param modelPath Path of the model file on disk.
 * @returns The instance record, also stored in the running-instances map.
 * @throws Error when `ollama create` exits with a non-zero code.
 */
async function loadViaOllama(artifact: ModelArtifact, modelPath: string): Promise<ModelInstance> {
  const port = 11434 // Ollama default
  const name = artifact.repo.replace("/", "-").toLowerCase()

  // The Modelfile has to reach `ollama create`. Write it to disk and pass its
  // path, since nothing is fed to the child's stdin here.
  const modelfileDir = Manager.backendDir("ollama")
  await mkdir(modelfileDir, { recursive: true })
  const modelfilePath = path.join(modelfileDir, `${name}.Modelfile`)
  await Bun.write(modelfilePath, `FROM ${modelPath}\n`)

  const result = await Process.run(["ollama", "create", name, "-f", modelfilePath])
  log.info("ollama create result", { code: result.code })
  if (result.code !== 0) {
    throw new Error(`ollama create failed for ${name} (exit code ${result.code})`)
  }

  const instance: ModelInstance = {
    id: name,
    artifact,
    status: "running",
    port,
    endpoint: `http://localhost:${port}`,
  }

  // Track it like the other loaders do, so it is visible to
  // runningInstances() and unload().
  running.set(instance.id, instance)
  return instance
}
|
|
85
|
+
|
|
86
|
+
/**
 * Start a `llama-server` process for a model file and wait until it answers.
 *
 * The server gets a port from `allocPort()`, a context size of 4096 and
 * `-ngl 99`. Readiness is polled once per second for up to 30 attempts.
 *
 * @param artifact Artifact being served.
 * @param modelPath Path of the model file, passed via `-m`.
 * @param gpuIndex When set, the model is pinned to that single GPU.
 * @returns The instance record, also stored in the running-instances map.
 * @throws Error when the process exits before becoming ready, or is still not
 *   reachable after the last attempt (the process is killed in that case).
 */
async function loadViaLlamaCpp(artifact: ModelArtifact, modelPath: string, gpuIndex?: number): Promise<ModelInstance> {
  const port = allocPort()

  const args = ["-m", modelPath, "--port", String(port), "-c", "4096", "-ngl", "99"]
  // llama-server has no `--gpu` flag. Pin to one device with split mode
  // "none" plus the main GPU index.
  if (gpuIndex !== undefined) args.push("--split-mode", "none", "--main-gpu", String(gpuIndex))

  const proc = Bun.spawn(["llama-server", ...args], {
    // Nothing reads the child's output. Unread pipes would eventually fill
    // up and block the server, so discard the output instead.
    stdout: "ignore",
    stderr: "ignore",
    windowsHide: true,
    detached: true,
  })

  // Wait for server to start
  let ready = false
  for (let attempt = 0; attempt < 30 && !ready; attempt++) {
    await new Promise((r) => setTimeout(r, 1000))
    if (proc.exitCode !== null) {
      throw new Error(`llama-server exited with code ${proc.exitCode} before becoming ready`)
    }
    ready = await isAlive(`http://localhost:${port}`)
  }
  if (!ready) {
    proc.kill()
    throw new Error(`llama-server did not become ready on port ${port}`)
  }

  const instance: ModelInstance = {
    id: `llamacpp-${port}`,
    artifact,
    status: "running",
    pid: proc.pid,
    port,
    gpuIndex,
    endpoint: `http://localhost:${port}/v1`,
  }

  running.set(instance.id, instance)
  return instance
}
|
|
119
|
+
|
|
120
|
+
/**
 * Start a `vllm serve` process for a repository and wait until it answers.
 *
 * vLLM is pointed at `artifact.repo`, which is also the served model name.
 * `modelPath` is accepted for signature parity with the other loaders but is
 * not used. Readiness is polled every two seconds for up to 120 attempts.
 *
 * @param artifact Artifact being served.
 * @param modelPath Unused by this loader.
 * @param gpuIndex When set, exported to the child as CUDA_VISIBLE_DEVICES.
 * @returns The instance record, also stored in the running-instances map.
 * @throws Error when the process exits before becoming ready, or is still not
 *   reachable after the last attempt (the process is killed in that case).
 */
async function loadViaVLLM(artifact: ModelArtifact, modelPath: string, gpuIndex?: number): Promise<ModelInstance> {
  const port = allocPort()

  const args = [
    "serve", artifact.repo,
    "--port", String(port),
    "--served-model-name", artifact.repo,
    "--trust-remote-code",
  ]

  const proc = Bun.spawn(["vllm", ...args], {
    // Nothing reads the child's output. Unread pipes would eventually fill
    // up and block the server, so discard the output instead.
    stdout: "ignore",
    stderr: "ignore",
    windowsHide: true,
    detached: true,
    env: {
      ...process.env,
      ...(gpuIndex !== undefined ? { CUDA_VISIBLE_DEVICES: String(gpuIndex) } : {}),
    },
  })

  // vLLM takes longer to start
  let ready = false
  for (let attempt = 0; attempt < 120 && !ready; attempt++) {
    await new Promise((r) => setTimeout(r, 2000))
    if (proc.exitCode !== null) {
      throw new Error(`vllm exited with code ${proc.exitCode} before becoming ready`)
    }
    ready = await isAlive(`http://localhost:${port}`)
  }
  if (!ready) {
    proc.kill()
    throw new Error(`vllm did not become ready on port ${port}`)
  }

  const instance: ModelInstance = {
    id: `vllm-${port}`,
    artifact,
    status: "running",
    pid: proc.pid,
    port,
    gpuIndex,
    endpoint: `http://localhost:${port}/v1`,
  }

  running.set(instance.id, instance)
  return instance
}
|
|
161
|
+
|
|
162
|
+
/**
 * Load a model, choosing GPU and backend automatically unless overridden.
 *
 * Steps: estimate the VRAM need, pick a GPU (`opts.gpuIndex` or the best fit
 * from the GPU profile), install the artifact if needed, pick a backend
 * (`opts.backend`, else by format and availability), then start it.
 *
 * While the backend is starting, a placeholder with status "starting" is kept
 * in the running-instances map. It is always removed again, and on success it
 * is replaced by the real instance.
 *
 * @param artifact Artifact to load.
 * @param opts.backend Force a specific backend.
 * @param opts.gpuIndex Force a specific GPU index.
 * @returns The started instance, including the estimated `vramUsageMB`.
 * @throws Whatever installation or the chosen backend loader throws.
 */
export async function load(
  artifact: ModelArtifact,
  opts?: { backend?: BackendType; gpuIndex?: number },
): Promise<ModelInstance> {
  const gpuProfile = await Effect.runPromise(GPU.profile)
  const vramNeeded = GPU.estimateVRAM(artifact.sizeBytes, artifact.quantization)

  // Pick GPU
  const gpuIndex = opts?.gpuIndex ?? GPU.findBestGPU(gpuProfile, vramNeeded)?.index

  if (gpuIndex === undefined && gpuProfile.cudaAvailable) {
    log.warn("insufficient VRAM, model may run on CPU", { needed: vramNeeded, available: gpuProfile.availableVRAMMB })
  }

  // Install if needed
  const modelPath = await Manager.install(artifact)

  // Pick backend
  const backends = await detectBackends()
  const preferred = opts?.backend ?? iife(() => {
    if (artifact.format === "gguf") {
      if (backends.find((b) => b.type === "llama.cpp")?.available) return "llama.cpp" as BackendType
      if (backends.find((b) => b.type === "ollama")?.available) return "ollama" as BackendType
    }
    if (backends.find((b) => b.type === "vllm")?.available) return "vllm" as BackendType
    if (backends.find((b) => b.type === "ollama")?.available) return "ollama" as BackendType
    return "llama.cpp" as BackendType
  })

  log.info("loading model", { repo: artifact.repo, backend: preferred, gpuIndex })

  // Placeholder visible to runningInstances() while the backend starts.
  const placeholder: ModelInstance = {
    id: `${preferred}-${artifact.repo}`,
    artifact,
    status: "starting",
    gpuIndex,
    vramUsageMB: vramNeeded,
  }
  running.set(placeholder.id, { ...placeholder })

  // Load via chosen backend. Backends without a dedicated loader use llama.cpp.
  const start = (): Promise<ModelInstance> => {
    switch (preferred) {
      case "ollama":
        return loadViaOllama(artifact, modelPath)
      case "vllm":
        return loadViaVLLM(artifact, modelPath, gpuIndex)
      case "llama.cpp":
      default:
        return loadViaLlamaCpp(artifact, modelPath, gpuIndex)
    }
  }

  let loaded: ModelInstance
  try {
    loaded = await start()
  } finally {
    // Success or failure, the placeholder must not linger as "starting".
    running.delete(placeholder.id)
  }

  // Keep the VRAM estimate on the tracked instance so callers can see it.
  const instance: ModelInstance = { ...loaded, vramUsageMB: loaded.vramUsageMB ?? vramNeeded }
  running.set(instance.id, instance)
  return instance
}
|
|
219
|
+
|
|
220
|
+
/**
 * Stop and forget a tracked model instance.
 *
 * Does nothing for an unknown id. If the instance has a pid, the process is
 * signalled and any error from doing so is ignored. The record is then marked
 * "stopped", its pid cleared, and it is removed from the running map.
 *
 * @param instanceId Id under which the instance is tracked.
 */
export async function unload(instanceId: string): Promise<void> {
  const tracked = running.get(instanceId)
  if (tracked === undefined) return

  if (tracked.pid) {
    try {
      process.kill(tracked.pid)
    } catch {
      /* already dead */
    }
  }

  // Update the record in place so holders of the object see the new state.
  Object.assign(tracked, { status: "stopped", pid: undefined })
  running.delete(instanceId)
  log.info("unloaded model", { id: instanceId })
}
|
|
234
|
+
|
|
235
|
+
/**
 * Snapshot of every instance currently tracked in the running map.
 *
 * @returns A new array; changing it does not affect the map.
 */
export function runningInstances(): ModelInstance[] {
  return Array.from(running.values())
}
|
|
239
|
+
|
|
240
|
+
/**
 * Look up a tracked instance by exact endpoint URL.
 *
 * @param url Endpoint to match with strict string equality.
 * @returns The first instance whose `endpoint` equals `url`, or `undefined`.
 */
export function findByEndpoint(url: string): ModelInstance | undefined {
  for (const candidate of running.values()) {
    if (candidate.endpoint === url) return candidate
  }
  return undefined
}
|
|
244
|
+
|
|
245
|
+
/**
 * Load several models one after another, spreading them across GPUs.
 *
 * Artifacts are handled largest first. For each one, the GPU with the most
 * estimated free VRAM that still fits it is chosen and its estimate is booked
 * against that GPU. If none fits, `load` is called without a GPU index. A
 * failing load does not abort the batch; it yields an entry with status
 * "error".
 *
 * @param artifacts Artifacts to load; the input array is not reordered.
 * @returns One instance (or error record) per artifact, in processing order
 *   (largest first).
 */
export async function loadBatch(
  artifacts: ModelArtifact[],
): Promise<ModelInstance[]> {
  const profile = await Effect.runPromise(GPU.profile)

  // gpu index -> MB considered in use (current usage plus bookings made below)
  const usedByGpu = new Map<number, number>()
  for (const gpu of profile.gpus) {
    usedByGpu.set(gpu.index, gpu.vramTotalMB - gpu.vramFreeMB)
  }

  // Biggest models get first pick of the GPUs.
  const bySizeDesc = [...artifacts].sort((left, right) => right.sizeBytes - left.sizeBytes)
  const outcomes: ModelInstance[] = []

  for (const artifact of bySizeDesc) {
    const needed = GPU.estimateVRAM(artifact.sizeBytes, artifact.quantization)

    let chosen: number | undefined
    let chosenFree = 0
    for (const [index, usedMB] of usedByGpu) {
      const device = profile.gpus.find((g) => g.index === index)
      if (!device) continue
      const remaining = device.vramTotalMB - usedMB
      if (remaining < needed) continue
      if (remaining > chosenFree) {
        chosen = index
        chosenFree = remaining
      }
    }

    if (chosen !== undefined) {
      usedByGpu.set(chosen, (usedByGpu.get(chosen) ?? 0) + needed)
    }

    try {
      outcomes.push(await load(artifact, { gpuIndex: chosen }))
    } catch (e) {
      log.error("failed to load model in batch", { repo: artifact.repo, error: e })
      outcomes.push({
        id: `error-${artifact.repo}`,
        artifact,
        status: "error",
        error: e instanceof Error ? e.message : String(e),
      })
    }
  }

  return outcomes
}
|
|
295
|
+
|
|
296
|
+
/**
 * Unload every tracked instance, one at a time.
 *
 * The ids are copied first, because `unload` removes entries from the map
 * while this loop runs.
 */
export async function unloadAll(): Promise<void> {
  const ids = Array.from(running.keys())
  for (const id of ids) {
    await unload(id)
  }
}
|