saeeol 1.0.9 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/npm/bin/saeeol +42 -0
  2. package/npm/package.json +39 -0
  3. package/npm/postinstall.js +162 -0
  4. package/package.json +2 -2
  5. package/src/cli/cmd/mcp-refresh.ts +47 -0
  6. package/src/cli/cmd/mcp.ts +3 -1
  7. package/src/cli/cmd/tui/app-commands-core.tsx +11 -0
  8. package/src/cli/cmd/tui/app-commands-system.tsx +20 -0
  9. package/src/cli/cmd/tui/app-events.ts +43 -0
  10. package/src/cli/cmd/tui/app.tsx +4 -0
  11. package/src/cli/cmd/tui/component/dialog-model.tsx +2 -2
  12. package/src/cli/cmd/tui/component/prompt/use-prompt-memos.ts +1 -1
  13. package/src/cli/cmd/tui/component/use-connected.tsx +1 -1
  14. package/src/cli/cmd/tui/context/local.tsx +10 -3
  15. package/src/cli/cmd/tui/context/route.tsx +5 -1
  16. package/src/cli/cmd/tui/feature-plugins/sidebar/context.tsx +1 -1
  17. package/src/cli/cmd/tui/plugin/api.tsx +7 -3
  18. package/src/cli/cmd/tui/routes/local-models.tsx +151 -0
  19. package/src/cli/cmd/tui/routes/session/subagent-footer.tsx +1 -1
  20. package/src/cli/cmd/tui/util/model.ts +1 -1
  21. package/src/config/config-schema.ts +44 -0
  22. package/src/ltm/config.ts +124 -0
  23. package/src/ltm/events.ts +50 -0
  24. package/src/ltm/index.ts +12 -0
  25. package/src/ltm/memory/episodic.ts +83 -0
  26. package/src/ltm/memory/procedural.ts +102 -0
  27. package/src/ltm/memory/semantic.ts +80 -0
  28. package/src/ltm/pipeline.ts +155 -0
  29. package/src/ltm/retrieval.ts +62 -0
  30. package/src/ltm/scheduler.ts +55 -0
  31. package/src/ltm/store.ts +150 -0
  32. package/src/ltm/types.ts +108 -0
  33. package/src/mcp/index.ts +32 -1
  34. package/src/provider/custom-loaders.ts +12 -0
  35. package/src/provider/loader-local.ts +185 -0
  36. package/src/provider/local/embedder.ts +220 -0
  37. package/src/provider/local/events.ts +74 -0
  38. package/src/provider/local/gpu.ts +93 -0
  39. package/src/provider/local/hub.ts +174 -0
  40. package/src/provider/local/index.ts +10 -0
  41. package/src/provider/local/model-manager.ts +113 -0
  42. package/src/provider/local/orchestrator.ts +301 -0
  43. package/src/provider/local/rag.ts +112 -0
  44. package/src/provider/local/types.ts +142 -0
  45. package/src/provider/provider-conversion.ts +2 -0
  46. package/src/provider/provider-schema.ts +17 -2
  47. package/src/provider/provider-schemas.ts +10 -3
  48. package/src/provider/provider-state.ts +10 -2
  49. package/src/provider/provider.ts +2 -1
  50. package/src/saeeol/plugins/sidebar-usage.tsx +1 -1
  51. package/src/server/routes/instance/config.ts +1 -1
  52. package/src/server/routes/instance/httpapi/api.ts +2 -0
  53. package/src/server/routes/instance/httpapi/groups/local.ts +87 -0
  54. package/src/server/routes/instance/httpapi/groups/mcp.ts +10 -0
  55. package/src/server/routes/instance/httpapi/handlers/local.ts +95 -0
  56. package/src/server/routes/instance/httpapi/handlers/mcp.ts +5 -0
  57. package/src/server/routes/instance/httpapi/handlers/provider.ts +1 -1
  58. package/src/server/routes/instance/httpapi/server.ts +2 -0
  59. package/src/server/routes/instance/provider.ts +2 -2
  60. package/src/session/prompt-reminders.ts +29 -0
  61. package/test/fake/provider.ts +1 -0
  62. package/test/provider/local.test.ts +208 -0
  63. package/test/provider/provider-category.test.ts +190 -0
@@ -0,0 +1,185 @@
1
+ /** Local provider loaders — Ollama, LM Studio, vLLM, llama.cpp, text-generation-webui */
2
+
3
+ import os from "os"
4
+ import { Effect } from "effect"
5
+ import { InstallationVersion } from "@saeeol/core/installation/version"
6
+ import type { CustomDep, CustomLoader } from "./provider-types"
7
+ import type { Info } from "./provider-schema"
8
+ import { useLanguageModel } from "./bundled-providers"
9
+ import { iife } from "@/util/iife"
10
+
11
/**
 * Build the User-Agent header value sent to a local provider.
 *
 * @param name - Provider label placed after the saeeol version.
 * @returns `saeeol/<version> <name> (<platform> <release>; <arch>)`.
 */
function ua(name: string): string {
  const product = `saeeol/${InstallationVersion}`
  const system = `${os.platform()} ${os.release()}; ${os.arch()}`
  return `${product} ${name} (${system})`
}
14
+
15
/** Timeout (ms) applied to model-discovery requests against a local server. */
const DISCOVERY_TIMEOUT_MS = 5000

/**
 * Pick the first usable endpoint from `candidates`, falling back to `fallback`.
 *
 * Non-string and blank candidates are skipped. The winner is trimmed and stripped of trailing
 * slashes so that `${endpoint}/path` never produces `//path`.
 */
function pickEndpoint(candidates: readonly unknown[], fallback: string): string {
  for (const candidate of candidates) {
    if (typeof candidate !== "string") continue
    const cleaned = candidate.trim().replace(/\/+$/, "")
    if (cleaned !== "") return cleaned
  }
  return fallback
}

/**
 * Turn an `OLLAMA_HOST` value into a URL.
 *
 * `OLLAMA_HOST` is commonly written as `host:port` or a bare host, which cannot be used as a
 * base URL directly. Values that already carry a scheme are returned as-is; otherwise `http://`
 * is prepended and Ollama's default port 11434 is added when no port is present.
 */
function ollamaHostToURL(host: unknown): string | undefined {
  if (typeof host !== "string") return undefined
  const value = host.trim()
  if (value === "") return undefined
  if (/^[a-z][a-z0-9+.-]*:\/\//i.test(value)) return value
  const [authority = "", ...rest] = value.split("/")
  const withPort = /:\d+$/.test(authority) ? authority : `${authority}:11434`
  return [`http://${withPort}`, ...rest].join("/")
}

/** Resolve a model handle from an SDK instance, preferring `languageModel` when the SDK supports it. */
async function resolveLocalModel(sdk: any, modelID: string) {
  if (useLanguageModel(sdk)) return sdk.languageModel(modelID)
  return sdk.chat(modelID)
}

/**
 * Fetch a model listing and return the ids found under `payload[listKey][*][idKey]`.
 * Best effort: any network, HTTP or shape problem yields an empty list.
 */
async function listModelIDs(url: string, listKey: string, idKey: string): Promise<string[]> {
  try {
    const res = await fetch(url, { signal: AbortSignal.timeout(DISCOVERY_TIMEOUT_MS) })
    if (!res.ok) return []
    const payload = (await res.json()) as Record<string, unknown> | null
    const entries = payload?.[listKey]
    if (!Array.isArray(entries)) return []
    return entries.flatMap((entry) => {
      const id = (entry as Record<string, unknown> | null)?.[idKey]
      return typeof id === "string" && id !== "" ? [id] : []
    })
  } catch {
    // An unreachable or misbehaving local server simply contributes no models.
    return []
  }
}

/** Build the model table for discovered ids, using the defaults shared by all local models. */
function describeModels(providerID: string, ids: readonly string[], url: string): Record<string, any> {
  const textOnly = () => ({ text: true, image: false, audio: false, video: false, pdf: false })
  const models: Record<string, any> = {}
  for (const id of ids) {
    models[id] = {
      id,
      name: id,
      providerID,
      api: { id, url, npm: "@ai-sdk/openai-compatible" },
      capabilities: {
        temperature: true,
        reasoning: false,
        attachment: false,
        toolcall: true,
        input: textOnly(),
        output: textOnly(),
        interleaved: false,
      },
      cost: { input: 0, output: 0, cache: { read: 0, write: 0 } },
      limit: { context: 32768, output: 4096 },
      status: "active" as const,
      options: {},
      headers: {},
      release_date: "",
    }
  }
  return models
}

/**
 * Create the custom loaders for locally hosted providers: Ollama, LM Studio, vLLM,
 * text-generation-webui and llama.cpp.
 *
 * Each loader resolves its endpoint from `provider.options.baseURL`, then the provider's
 * environment variable(s), then a localhost default. It returns the SDK options, the resolved
 * endpoint via `vars()`, and a `getModel` resolver. Ollama and LM Studio additionally expose
 * `discoverModels()`, which lists the models the server reports and returns `{}` on any failure.
 *
 * @param dep - Loader dependencies; only `dep.env()` is used here.
 * @returns Loaders keyed by provider id.
 */
export function localLoaders(dep: CustomDep): Record<string, CustomLoader> {
  return {
    ollama: Effect.fnUntraced(function* (provider: Info) {
      const env = yield* dep.env()
      const endpoint = pickEndpoint(
        [provider.options?.baseURL, env["OLLAMA_BASE_URL"], ollamaHostToURL(env["OLLAMA_HOST"])],
        "http://localhost:11434",
      )

      return {
        autoload: true,
        options: {
          // NOTE(review): discovered models advertise `${endpoint}/v1` while baseURL has no
          // `/v1` suffix — confirm which one the SDK is actually built from.
          baseURL: endpoint,
          headers: { "User-Agent": ua("ollama") },
        },
        vars(): Record<string, string> {
          return { OLLAMA_BASE_URL: endpoint }
        },
        async getModel(sdk: any, modelID: string, _options?: Record<string, any>) {
          return resolveLocalModel(sdk, modelID)
        },
        async discoverModels() {
          const ids = await listModelIDs(`${endpoint}/api/tags`, "models", "name")
          return describeModels("ollama", ids, `${endpoint}/v1`)
        },
      }
    }),

    lmstudio: Effect.fnUntraced(function* (provider: Info) {
      const env = yield* dep.env()
      const endpoint = pickEndpoint(
        [provider.options?.baseURL, env["LMSTUDIO_BASE_URL"]],
        "http://localhost:1234/v1",
      )

      return {
        autoload: true,
        options: {
          baseURL: endpoint,
          headers: { "User-Agent": ua("lmstudio") },
        },
        vars(): Record<string, string> {
          return { LMSTUDIO_BASE_URL: endpoint }
        },
        async getModel(sdk: any, modelID: string, _options?: Record<string, any>) {
          return resolveLocalModel(sdk, modelID)
        },
        async discoverModels() {
          const ids = await listModelIDs(`${endpoint}/models`, "data", "id")
          return describeModels("lmstudio", ids, endpoint)
        },
      }
    }),

    vllm: Effect.fnUntraced(function* (provider: Info) {
      const env = yield* dep.env()
      const endpoint = pickEndpoint(
        [provider.options?.baseURL, env["VLLM_BASE_URL"]],
        "http://localhost:8000/v1",
      )

      return {
        autoload: true,
        options: {
          baseURL: endpoint,
          headers: { "User-Agent": ua("vllm") },
        },
        vars(): Record<string, string> {
          return { VLLM_BASE_URL: endpoint }
        },
        async getModel(sdk: any, modelID: string, _options?: Record<string, any>) {
          return resolveLocalModel(sdk, modelID)
        },
      }
    }),

    "text-generation-webui": Effect.fnUntraced(function* (provider: Info) {
      const env = yield* dep.env()
      // NOTE(review): unlike lmstudio/vllm this default has no `/v1` suffix — confirm intended.
      const endpoint = pickEndpoint(
        [provider.options?.baseURL, env["TEXT_GEN_WEBUI_BASE_URL"]],
        "http://localhost:5000",
      )

      return {
        autoload: true,
        options: {
          baseURL: endpoint,
          headers: { "User-Agent": ua("text-generation-webui") },
        },
        vars(): Record<string, string> {
          return { TEXT_GEN_WEBUI_BASE_URL: endpoint }
        },
        async getModel(sdk: any, modelID: string, _options?: Record<string, any>) {
          return resolveLocalModel(sdk, modelID)
        },
      }
    }),

    "llama.cpp": Effect.fnUntraced(function* (provider: Info) {
      const env = yield* dep.env()
      // NOTE(review): unlike lmstudio/vllm this default has no `/v1` suffix — confirm intended.
      const endpoint = pickEndpoint(
        [provider.options?.baseURL, env["LLAMA_CPP_BASE_URL"]],
        "http://localhost:8080",
      )

      return {
        autoload: true,
        options: {
          baseURL: endpoint,
          headers: { "User-Agent": ua("llama.cpp") },
        },
        vars(): Record<string, string> {
          return { LLAMA_CPP_BASE_URL: endpoint }
        },
        async getModel(sdk: any, modelID: string, _options?: Record<string, any>) {
          return resolveLocalModel(sdk, modelID)
        },
      }
    }),
  }
}
@@ -0,0 +1,220 @@
1
+ /** 로컬 임베딩 서버 — Ollama 기반 임베딩 모델 실행 관리 */
2
+
3
+ import { Effect } from "effect"
4
+ import * as Log from "@saeeol/core/util/log"
5
+ import { Process } from "@/util/process"
6
+ import type { EmbeddingServer, EmbedderStatus, HardwareProfile, LLMBakeParams } from "@/ltm/types"
7
+ import type { BackendType } from "@/provider/local/types"
8
+ import * as GPU from "@/provider/local/gpu"
9
+ import * as RAG from "@/provider/local/rag"
10
+ import * as Hub from "@/provider/local/hub"
11
+ import * as Manager from "@/provider/local/model-manager"
12
+ import { iife } from "@/util/iife"
13
+ import * as Bus from "@/bus"
14
+ import { LTMEvent } from "@/ltm/events"
15
+
16
+ const log = Log.create({ service: "local/embedder" })
17
+
18
+ let server: EmbeddingServer | undefined
19
+
20
+ // ── Ollama 상태 확인 ──
21
+
22
/**
 * Probe whether an Ollama server answers at `endpoint`.
 *
 * Issues `GET {endpoint}/api/tags` with a 3 second timeout.
 *
 * @returns `true` only when the request completes with an OK status; any failure yields `false`.
 */
async function isOllamaRunning(endpoint: string): Promise<boolean> {
  let reachable = false
  try {
    const { ok } = await fetch(`${endpoint}/api/tags`, { signal: AbortSignal.timeout(3000) })
    reachable = ok
  } catch {
    // Connection errors and timeouts both mean "not running".
  }
  return reachable
}
30
+
31
/**
 * Check whether `model` is currently loaded into memory by the Ollama server.
 *
 * Queries `GET {endpoint}/api/ps` (running models) with a 3 second timeout. The previous
 * implementation asked `/api/tags`, which lists *installed* models, so an installed model was
 * always reported as loaded and the caller's warm-up step never ran.
 *
 * @param endpoint - Base URL of the Ollama server.
 * @param model - Ollama model name, with or without the `:latest` tag.
 * @returns `true` when a running model's `name` or `model` equals `model` or `${model}:latest`;
 *          `false` otherwise, including on any request failure or a non-OK status (for
 *          example a server that does not expose `/api/ps`).
 */
async function isModelLoaded(endpoint: string, model: string): Promise<boolean> {
  try {
    const res = await fetch(`${endpoint}/api/ps`, { signal: AbortSignal.timeout(3000) })
    if (!res.ok) return false
    const data = (await res.json()) as { models?: Array<{ name?: unknown; model?: unknown }> }
    const wanted = new Set([model, `${model}:latest`])
    return (data.models ?? []).some((entry) =>
      [entry.name, entry.model].some((tag) => typeof tag === "string" && wanted.has(tag)),
    )
  } catch {
    // Treat an unreachable server or malformed payload as "not loaded".
    return false
  }
}
41
+
42
/**
 * Check whether an embedding model is installed in Ollama.
 *
 * Reads `GET {endpoint}/api/tags` (5 second timeout) and looks for an entry whose name equals
 * `model`, equals `${model}:latest`, or equals `model` once `":latest"` is removed from it.
 *
 * @returns `true` on a match; `false` when nothing matches, the status is not OK, or the
 *          request or parsing fails.
 */
async function isModelInstalled(endpoint: string, model: string): Promise<boolean> {
  try {
    const res = await fetch(`${endpoint}/api/tags`, { signal: AbortSignal.timeout(5000) })
    if (!res.ok) return false
    const listing = (await res.json()) as { models?: Array<{ name: string }> }
    for (const entry of listing.models ?? []) {
      if (entry.name === model) return true
      if (entry.name === `${model}:latest`) return true
      if (entry.name.replace(":latest", "") === model) return true
    }
    return false
  } catch {
    return false
  }
}
55
+
56
+ // ── Ollama에 모델 pull ──
57
+
58
/**
 * Ask Ollama to download an embedding model.
 *
 * Sends a non-streaming `POST {endpoint}/api/pull` for the Ollama name that `model` maps to,
 * with a 5 minute timeout.
 *
 * @throws Error when the server answers with a non-OK status; network and timeout errors are
 *         rethrown. Every failure is logged before being rethrown.
 */
async function pullModel(endpoint: string, model: string): Promise<void> {
  log.info("pulling embedding model", { model })
  const ollamaModel = getOllamaModelName(model)

  try {
    const res = await fetch(`${endpoint}/api/pull`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ name: ollamaModel, stream: false }),
      signal: AbortSignal.timeout(300_000), // 5 minute timeout
    })
    if (res.ok) {
      log.info("model pulled", { model: ollamaModel })
      return
    }
    // The body may be unreadable; fall back to a generic message in that case.
    const detail = await res.text().catch(() => "unknown error")
    throw new Error(`Ollama pull failed: ${res.status} ${detail}`)
  } catch (cause) {
    log.error("failed to pull model", { model: ollamaModel, error: cause })
    throw cause
  }
}
79
+
80
/**
 * Internal embedding model id → Ollama model name.
 *
 * NOTE(review): several ids are substituted with `nomic-embed-text`, which is a different model
 * from the one requested — confirm the configured embedding dimensions still match.
 */
const OLLAMA_MODEL_NAMES: ReadonlyMap<string, string> = new Map([
  ["bge-small-en", "nomic-embed-text"],
  ["bge-base-en", "nomic-embed-text"],
  ["bge-large-en", "nomic-embed-text"],
  ["bge-m3", "bge-m3"],
  ["nomic-embed", "nomic-embed-text"],
  ["all-minilm-l6", "all-minilm"],
  ["gte-small", "nomic-embed-text"],
])

/**
 * Map an internal model id to the name Ollama knows it by.
 *
 * Uses a `Map` so that ids colliding with `Object.prototype` members (`constructor`,
 * `toString`, `__proto__`, …) fall through to the id itself instead of returning an inherited
 * non-string value.
 *
 * @param modelId - Internal model id, or an Ollama name already.
 * @returns The mapped Ollama name, or `modelId` unchanged when there is no mapping.
 */
function getOllamaModelName(modelId: string): string {
  return OLLAMA_MODEL_NAMES.get(modelId) ?? modelId
}
93
+
94
+ // ── 임베딩 API 호출 ──
95
+
96
/**
 * Embed `texts` through Ollama's `POST {endpoint}/api/embed` (30 second timeout).
 *
 * @param endpoint - Base URL of the Ollama server.
 * @param model - Internal model id; translated with `getOllamaModelName` before the request.
 * @param texts - Inputs to embed. An empty array returns `[]` without contacting the server.
 * @returns One vector per input, in the order the server returned them.
 * @throws Error when the status is not OK, when the payload has no `embeddings` array, or when
 *         the number of vectors differs from the number of inputs.
 */
async function embedViaOllama(endpoint: string, model: string, texts: string[]): Promise<number[][]> {
  if (texts.length === 0) return []

  const ollamaModel = getOllamaModelName(model)
  const res = await fetch(`${endpoint}/api/embed`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: ollamaModel, input: texts }),
    signal: AbortSignal.timeout(30_000),
  })

  if (!res.ok) {
    const err = await res.text().catch(() => "unknown error")
    throw new Error(`Embedding failed: ${res.status} ${err}`)
  }

  const data = (await res.json()) as { embeddings?: unknown }
  if (!Array.isArray(data.embeddings)) {
    throw new Error("Invalid embedding response")
  }
  // Callers index the result by input position, so a count mismatch must not pass silently.
  if (data.embeddings.length !== texts.length) {
    throw new Error(
      `Invalid embedding response: expected ${texts.length} vectors, got ${data.embeddings.length}`,
    )
  }
  return data.embeddings as number[][]
}
116
+
117
+ // ── VRAM 추정 ──
118
+
119
/**
 * Estimate the memory footprint (MB) of an embedding model.
 *
 * Looks the id up in `RAG.EMBEDDING_MODELS` and returns its `sizeBytes` scaled by 1.2,
 * converted to MB and rounded up. Ids that are not listed get a flat 200.
 */
function estimateEmbeddingVRAM(modelId: string): number {
  const bytesPerMB = 1024 * 1024
  for (const candidate of RAG.EMBEDDING_MODELS) {
    if (candidate.id === modelId) {
      return Math.ceil(candidate.sizeBytes * 1.2 / bytesPerMB)
    }
  }
  return 200
}
124
+
125
+ // ── 공개 API ──
126
+
127
/**
 * Resolve the Ollama endpoint from environment-style candidates.
 *
 * Blank values are skipped, trailing slashes are removed, and scheme-less values (the usual
 * `host:port` form of `OLLAMA_HOST`) get `http://` prepended plus Ollama's default port 11434
 * when no port is present.
 */
function resolveOllamaEndpoint(...candidates: Array<string | undefined>): string {
  for (const candidate of candidates) {
    const value = candidate?.trim().replace(/\/+$/, "")
    if (!value) continue
    if (/^[a-z][a-z0-9+.-]*:\/\//i.test(value)) return value
    const [authority = "", ...rest] = value.split("/")
    const withPort = /:\d+$/.test(authority) ? authority : `${authority}:11434`
    return [`http://${withPort}`, ...rest].join("/")
  }
  return "http://localhost:11434"
}

/**
 * Start the embedding server (an embedding model served by Ollama).
 *
 * Steps: reuse the current server if it is already running; otherwise record a new server in
 * the `"starting"` state, verify Ollama is reachable, pull the model if it is not installed,
 * warm it up with one embedding call if it is not loaded, and mark the server `"running"`.
 * Every status transition is published as `LTMEvent.EmbedderStatusChanged`.
 *
 * @param bake - Supplies the embedding model id and its vector dimensions.
 * @returns The server record. Its status is `"error"` when Ollama is unreachable or the warm-up
 *          call fails.
 * @throws Rethrows the pull error when the model cannot be downloaded; the server is marked
 *         `"error"` first so it does not stay stuck in `"starting"`.
 */
export async function start(bake: LLMBakeParams): Promise<EmbeddingServer> {
  if (server && server.status === "running") {
    log.info("embedder already running", { model: server.model })
    return server
  }

  const endpoint = resolveOllamaEndpoint(process.env.OLLAMA_BASE_URL, process.env.OLLAMA_HOST)

  // Keep a local reference: the module-level `server` may be cleared by stop() while we await.
  const current: EmbeddingServer = {
    id: "local-embedding",
    model: bake.embeddingModel,
    status: "starting",
    endpoint,
    dimensions: bake.embeddingDimensions,
    vramMB: estimateEmbeddingVRAM(bake.embeddingModel),
  }
  server = current

  const markFailed = (): EmbeddingServer => {
    current.status = "error"
    void Bus.publish(LTMEvent.EmbedderStatusChanged, { status: "error", model: bake.embeddingModel })
    return current
  }

  void Bus.publish(LTMEvent.EmbedderStatusChanged, { status: "starting", model: bake.embeddingModel })

  // Make sure Ollama is up before doing anything else.
  const running = await isOllamaRunning(endpoint)
  if (!running) {
    log.warn("Ollama not running, embedding server unavailable", { endpoint })
    return markFailed()
  }

  // Install the model when it is missing.
  const ollamaModel = getOllamaModelName(bake.embeddingModel)
  const installed = await isModelInstalled(endpoint, ollamaModel)
  if (!installed) {
    log.info("model not installed, pulling", { model: ollamaModel })
    try {
      await pullModel(endpoint, ollamaModel)
    } catch (e) {
      markFailed()
      throw e
    }
  }

  // Ollama loads a model on first use, so one warm-up call brings it into memory.
  const loaded = await isModelLoaded(endpoint, ollamaModel)
  if (!loaded) {
    try {
      await embedViaOllama(endpoint, bake.embeddingModel, ["warmup"])
      log.info("model warmed up", { model: ollamaModel })
    } catch (e) {
      log.error("warmup failed", { model: ollamaModel, error: e })
      return markFailed()
    }
  }

  current.status = "running"
  void Bus.publish(LTMEvent.EmbedderStatusChanged, { status: "running", model: bake.embeddingModel })
  log.info("embedding server started", { model: bake.embeddingModel, dimensions: bake.embeddingDimensions })

  return current
}
188
+
189
/**
 * Stop the embedding server.
 *
 * Marks the current server record `"stopped"`, publishes the status change, logs it and clears
 * the module-level record. Does nothing when no server is registered.
 */
export async function stop(): Promise<void> {
  const current = server
  if (current === undefined) return

  current.status = "stopped"
  void Bus.publish(LTMEvent.EmbedderStatusChanged, { status: "stopped" })
  log.info("embedding server stopped")
  server = undefined
}
197
+
198
/**
 * Current embedding server record.
 *
 * @returns The record, or `undefined` when no server is registered.
 */
export function status(): EmbeddingServer | undefined {
  const current = server
  return current
}
202
+
203
/**
 * Embed an array of texts with the running embedding server.
 *
 * @param texts - Inputs to embed.
 * @returns The embedding vectors produced for `texts`.
 * @throws Error("Embedding server not running") unless a server exists with status `"running"`.
 */
export async function embed(texts: string[]): Promise<number[][]> {
  const active = server?.status === "running" ? server : undefined
  if (active === undefined) {
    throw new Error("Embedding server not running")
  }
  return embedViaOllama(active.endpoint, active.model, texts)
}
210
+
211
/**
 * Embed a single text.
 *
 * @param text - Input to embed.
 * @returns The embedding vector for `text`.
 * @throws Error when the embedding call yields no vector (previously this returned `undefined`
 *         typed as `number[]`); errors from `embed` propagate unchanged.
 */
export async function embedOne(text: string): Promise<number[]> {
  const [vector] = await embed([text])
  if (vector === undefined) {
    throw new Error("Embedding response contained no vector")
  }
  return vector
}
216
+
217
/**
 * VRAM usage of the embedding server, in MB.
 *
 * @returns The `vramMB` recorded on the current server, or 0 when there is no server or no value.
 */
export function vramUsage(): number {
  if (server === undefined) return 0
  return server.vramMB ?? 0
}
@@ -0,0 +1,74 @@
1
+ /** Local model events — TUI/webview subscription for progress/status updates */
2
+
3
+ import { BusEvent } from "@/bus/bus-event"
4
+ import { Schema } from "effect"
5
+
6
/** Fields shared by events that refer to a model by local id and source repository. */
const modelRef = { id: Schema.String, repo: Schema.String }

/**
 * Bus events emitted for local models, grouped by lifecycle:
 * downloads (`local.download.*`), model processes (`local.model.*`) and GPU profiling
 * (`local.gpu.profiled`).
 */
export const LocalModelEvent = {
  // ── Downloads ──
  DownloadStarted: BusEvent.define(
    "local.download.started",
    Schema.Struct({ ...modelRef, filename: Schema.String, totalBytes: Schema.Number }),
  ),
  DownloadProgress: BusEvent.define(
    "local.download.progress",
    Schema.Struct({
      id: Schema.String,
      downloaded: Schema.Number,
      total: Schema.Number,
      // The only numeric field declared with Schema.Finite rather than Schema.Number.
      speedMBps: Schema.Finite,
    }),
  ),
  DownloadCompleted: BusEvent.define(
    "local.download.completed",
    Schema.Struct({ ...modelRef, filename: Schema.String, path: Schema.String }),
  ),
  DownloadFailed: BusEvent.define(
    "local.download.failed",
    Schema.Struct({ ...modelRef, error: Schema.String }),
  ),

  // ── Model processes ──
  ModelStarted: BusEvent.define(
    "local.model.started",
    Schema.Struct({
      ...modelRef,
      endpoint: Schema.String,
      gpuIndex: Schema.optional(Schema.Number),
    }),
  ),
  ModelStopped: BusEvent.define("local.model.stopped", Schema.Struct({ ...modelRef })),
  ModelError: BusEvent.define(
    "local.model.error",
    Schema.Struct({ ...modelRef, error: Schema.String }),
  ),

  // ── Hardware ──
  GPUProfiled: BusEvent.define(
    "local.gpu.profiled",
    Schema.Struct({
      gpuCount: Schema.Number,
      totalVRAMMB: Schema.Number,
      availableVRAMMB: Schema.Number,
    }),
  ),
}
@@ -0,0 +1,93 @@
1
+ /** GPU profiler — detect hardware capabilities for model scheduling */
2
+
3
+ import { Effect } from "effect"
4
+ import * as Log from "@saeeol/core/util/log"
5
+ import { Process } from "@/util/process"
6
+ import type { GPUInfo, GPUProfile } from "./types"
7
+
8
+ const log = Log.create({ service: "local/gpu" })
9
+
10
/**
 * Parse `nvidia-smi --query-gpu=... --format=csv,noheader,nounits` output into GPU records.
 *
 * Expected column order: index, name, memory.total, memory.used, memory.free, then optionally
 * compute_cap, driver_version, cuda_version.
 *
 * Rows are skipped when they have fewer than five columns or when the index or any memory
 * column is empty or not a finite number (nvidia-smi prints placeholders such as `[N/A]`).
 * Keeping such rows would put `NaN` into the VRAM totals computed from the result. Optional
 * columns that are empty or bracketed placeholders become `undefined`.
 *
 * @param stdout - Raw command output; LF and CRLF line endings are both accepted.
 * @returns One record per valid row, in input order.
 */
function parseNvidiaSmi(stdout: string): GPUInfo[] {
  const toNumber = (cell: string | undefined): number =>
    cell === undefined || cell === "" ? Number.NaN : Number(cell)
  const toOptional = (cell: string | undefined): string | undefined =>
    cell === undefined || cell === "" || /^\[.*\]$/.test(cell) ? undefined : cell

  const gpus: GPUInfo[] = []
  for (const line of stdout.split(/\r?\n/)) {
    const parts = line.split(",").map((s) => s.trim())
    if (parts.length < 5) continue

    const [index, name = "", total, used, free, computeCap, driver, cuda] = parts
    const [indexNum, totalMB, usedMB, freeMB] = [index, total, used, free].map(toNumber)
    if (![indexNum, totalMB, usedMB, freeMB].every((n) => Number.isFinite(n))) continue

    gpus.push({
      index: indexNum as number,
      name,
      vramTotalMB: totalMB as number,
      vramUsedMB: usedMB as number,
      vramFreeMB: freeMB as number,
      computeCapability: toOptional(computeCap),
      driverVersion: toOptional(driver),
      cudaVersion: toOptional(cuda),
    })
  }
  return gpus
}
30
+
31
/**
 * Profile the available GPUs by running `nvidia-smi`.
 *
 * A failure to run the command is converted into a result with exit code 1. Any non-zero exit
 * code produces an empty profile (`cudaAvailable: false`); otherwise the CSV output is parsed
 * and the total and free VRAM are summed across GPUs.
 *
 * NOTE(review): `cuda_version` may not be a valid `--query-gpu` field on all drivers; if
 * nvidia-smi rejects it the command exits non-zero and this reports no GPUs — verify against
 * `nvidia-smi --help-query-gpu`.
 */
export const profile: Effect.Effect<GPUProfile> = Effect.gen(function* () {
  // Stand-in result used when the command cannot be run at all.
  const asFailure = () =>
    Effect.succeed({ code: 1, stdout: Buffer.alloc(0), stderr: Buffer.alloc(0) } as Process.Result)

  const query = Effect.tryPromise(() =>
    Process.run([
      "nvidia-smi",
      "--query-gpu=index,name,memory.total,memory.used,memory.free,compute_cap,driver_version,cuda_version",
      "--format=csv,noheader,nounits",
    ]),
  )
  const result = yield* query.pipe(Effect.catch(asFailure))

  if (result.code === 0) {
    const gpus = parseNvidiaSmi(result.stdout.toString())
    let totalVRAMMB = 0
    let availableVRAMMB = 0
    for (const gpu of gpus) {
      totalVRAMMB += gpu.vramTotalMB
      availableVRAMMB += gpu.vramFreeMB
    }
    return { gpus, totalVRAMMB, availableVRAMMB, cudaAvailable: gpus.length > 0 }
  }

  log.info("no nvidia-smi found, GPU profiling unavailable")
  return {
    gpus: [],
    totalVRAMMB: 0,
    availableVRAMMB: 0,
    cudaAvailable: false,
  }
})
59
+
60
/**
 * Estimate the VRAM a model needs, in MB.
 *
 * The model file size is taken as the baseline (GGUF files are already quantized) and 20% is
 * added for runtime overhead such as the KV cache and context.
 *
 * @param modelBytes - Size of the model file in bytes.
 * @param quantization - Quantization label; accepted but not used by the calculation.
 * @returns Estimated requirement in MB, rounded up.
 */
export function estimateVRAM(modelBytes: number, quantization: string): number {
  const bytesPerMB = 1024 * 1024
  const runtimeOverhead = modelBytes * 0.2
  const totalBytes = modelBytes + runtimeOverhead
  return Math.ceil(totalBytes / bytesPerMB)
}
67
+
68
/**
 * Pick the GPU best suited to a model that needs `vramMB` megabytes.
 *
 * Among the GPUs with at least `vramMB` free, the one with the most free VRAM wins; on a tie
 * the GPU listed first is kept.
 *
 * @returns The chosen GPU, or `undefined` when none has enough free VRAM.
 */
export function findBestGPU(profile: GPUProfile, vramMB: number): GPUInfo | undefined {
  let best: GPUInfo | undefined
  for (const gpu of profile.gpus) {
    if (!(gpu.vramFreeMB >= vramMB)) continue
    if (best === undefined || gpu.vramFreeMB > best.vramFreeMB) best = gpu
  }
  return best
}
75
+
76
/**
 * Check whether a model fits in the available GPU memory.
 *
 * Compares against `profile.availableVRAMMB`, the free-VRAM figure for the whole profile, not
 * against any single GPU.
 *
 * @returns `true` when `availableVRAMMB` is at least `vramMB`.
 */
export function canFit(profile: GPUProfile, vramMB: number): boolean {
  const { availableVRAMMB } = profile
  return availableVRAMMB >= vramMB
}
80
+
81
/**
 * Suggest a quantization level that fits in the available VRAM.
 *
 * Levels are tried from highest quality to lowest; the first one that has a known file size
 * and whose `estimateVRAM` result does not exceed `availableMB` is returned.
 *
 * @param availableMB - VRAM budget in MB.
 * @param modelSizeBytes - File size in bytes per quantization level; unlisted levels are skipped.
 * @returns The chosen level, or `undefined` when none fits.
 */
export function suggestQuantization(
  availableMB: number,
  modelSizeBytes: Record<string, number>,
): string | undefined {
  const byQuality = ["fp16", "q8_0", "q6_k", "q5_k_m", "q5_k_s", "q4_k_m", "q4_k_s", "q3_k_m", "q3_k_s", "q2_k"]
  return byQuality.find((level) => {
    const bytes = modelSizeBytes[level]
    return bytes !== undefined && estimateVRAM(bytes, level) <= availableMB
  })
}