inference-server 1.0.0-beta.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. package/README.md +216 -0
  2. package/dist/api/openai/enums.d.ts +4 -0
  3. package/dist/api/openai/enums.js +17 -0
  4. package/dist/api/openai/enums.js.map +1 -0
  5. package/dist/api/openai/handlers/chat.d.ts +3 -0
  6. package/dist/api/openai/handlers/chat.js +358 -0
  7. package/dist/api/openai/handlers/chat.js.map +1 -0
  8. package/dist/api/openai/handlers/completions.d.ts +3 -0
  9. package/dist/api/openai/handlers/completions.js +169 -0
  10. package/dist/api/openai/handlers/completions.js.map +1 -0
  11. package/dist/api/openai/handlers/embeddings.d.ts +3 -0
  12. package/dist/api/openai/handlers/embeddings.js +74 -0
  13. package/dist/api/openai/handlers/embeddings.js.map +1 -0
  14. package/dist/api/openai/handlers/images.d.ts +0 -0
  15. package/dist/api/openai/handlers/images.js +4 -0
  16. package/dist/api/openai/handlers/images.js.map +1 -0
  17. package/dist/api/openai/handlers/models.d.ts +3 -0
  18. package/dist/api/openai/handlers/models.js +23 -0
  19. package/dist/api/openai/handlers/models.js.map +1 -0
  20. package/dist/api/openai/handlers/transcription.d.ts +0 -0
  21. package/dist/api/openai/handlers/transcription.js +4 -0
  22. package/dist/api/openai/handlers/transcription.js.map +1 -0
  23. package/dist/api/openai/index.d.ts +7 -0
  24. package/dist/api/openai/index.js +14 -0
  25. package/dist/api/openai/index.js.map +1 -0
  26. package/dist/api/parseJSONRequestBody.d.ts +2 -0
  27. package/dist/api/parseJSONRequestBody.js +24 -0
  28. package/dist/api/parseJSONRequestBody.js.map +1 -0
  29. package/dist/api/v1/index.d.ts +2 -0
  30. package/dist/api/v1/index.js +29 -0
  31. package/dist/api/v1/index.js.map +1 -0
  32. package/dist/cli.d.ts +1 -0
  33. package/dist/cli.js +10 -0
  34. package/dist/cli.js.map +1 -0
  35. package/dist/engines/gpt4all/engine.d.ts +34 -0
  36. package/dist/engines/gpt4all/engine.js +357 -0
  37. package/dist/engines/gpt4all/engine.js.map +1 -0
  38. package/dist/engines/gpt4all/util.d.ts +3 -0
  39. package/dist/engines/gpt4all/util.js +29 -0
  40. package/dist/engines/gpt4all/util.js.map +1 -0
  41. package/dist/engines/index.d.ts +19 -0
  42. package/dist/engines/index.js +21 -0
  43. package/dist/engines/index.js.map +1 -0
  44. package/dist/engines/node-llama-cpp/engine.d.ts +49 -0
  45. package/dist/engines/node-llama-cpp/engine.js +666 -0
  46. package/dist/engines/node-llama-cpp/engine.js.map +1 -0
  47. package/dist/engines/node-llama-cpp/types.d.ts +13 -0
  48. package/dist/engines/node-llama-cpp/types.js +2 -0
  49. package/dist/engines/node-llama-cpp/types.js.map +1 -0
  50. package/dist/engines/node-llama-cpp/util.d.ts +15 -0
  51. package/dist/engines/node-llama-cpp/util.js +84 -0
  52. package/dist/engines/node-llama-cpp/util.js.map +1 -0
  53. package/dist/engines/node-llama-cpp/validateModelFile.d.ts +8 -0
  54. package/dist/engines/node-llama-cpp/validateModelFile.js +36 -0
  55. package/dist/engines/node-llama-cpp/validateModelFile.js.map +1 -0
  56. package/dist/engines/stable-diffusion-cpp/engine.d.ts +90 -0
  57. package/dist/engines/stable-diffusion-cpp/engine.js +294 -0
  58. package/dist/engines/stable-diffusion-cpp/engine.js.map +1 -0
  59. package/dist/engines/stable-diffusion-cpp/types.d.ts +3 -0
  60. package/dist/engines/stable-diffusion-cpp/types.js +2 -0
  61. package/dist/engines/stable-diffusion-cpp/types.js.map +1 -0
  62. package/dist/engines/stable-diffusion-cpp/util.d.ts +4 -0
  63. package/dist/engines/stable-diffusion-cpp/util.js +55 -0
  64. package/dist/engines/stable-diffusion-cpp/util.js.map +1 -0
  65. package/dist/engines/stable-diffusion-cpp/validateModelFiles.d.ts +19 -0
  66. package/dist/engines/stable-diffusion-cpp/validateModelFiles.js +91 -0
  67. package/dist/engines/stable-diffusion-cpp/validateModelFiles.js.map +1 -0
  68. package/dist/engines/transformers-js/engine.d.ts +37 -0
  69. package/dist/engines/transformers-js/engine.js +538 -0
  70. package/dist/engines/transformers-js/engine.js.map +1 -0
  71. package/dist/engines/transformers-js/types.d.ts +7 -0
  72. package/dist/engines/transformers-js/types.js +2 -0
  73. package/dist/engines/transformers-js/types.js.map +1 -0
  74. package/dist/engines/transformers-js/util.d.ts +7 -0
  75. package/dist/engines/transformers-js/util.js +36 -0
  76. package/dist/engines/transformers-js/util.js.map +1 -0
  77. package/dist/engines/transformers-js/validateModelFiles.d.ts +17 -0
  78. package/dist/engines/transformers-js/validateModelFiles.js +133 -0
  79. package/dist/engines/transformers-js/validateModelFiles.js.map +1 -0
  80. package/dist/experiments/ChatWithVision.d.ts +11 -0
  81. package/dist/experiments/ChatWithVision.js +91 -0
  82. package/dist/experiments/ChatWithVision.js.map +1 -0
  83. package/dist/experiments/StableDiffPromptGenerator.d.ts +0 -0
  84. package/dist/experiments/StableDiffPromptGenerator.js +4 -0
  85. package/dist/experiments/StableDiffPromptGenerator.js.map +1 -0
  86. package/dist/experiments/VoiceFunctionCall.d.ts +18 -0
  87. package/dist/experiments/VoiceFunctionCall.js +51 -0
  88. package/dist/experiments/VoiceFunctionCall.js.map +1 -0
  89. package/dist/http.d.ts +19 -0
  90. package/dist/http.js +54 -0
  91. package/dist/http.js.map +1 -0
  92. package/dist/index.d.ts +7 -0
  93. package/dist/index.js +8 -0
  94. package/dist/index.js.map +1 -0
  95. package/dist/instance.d.ts +88 -0
  96. package/dist/instance.js +594 -0
  97. package/dist/instance.js.map +1 -0
  98. package/dist/lib/acquireFileLock.d.ts +7 -0
  99. package/dist/lib/acquireFileLock.js +38 -0
  100. package/dist/lib/acquireFileLock.js.map +1 -0
  101. package/dist/lib/calculateContextIdentity.d.ts +7 -0
  102. package/dist/lib/calculateContextIdentity.js +39 -0
  103. package/dist/lib/calculateContextIdentity.js.map +1 -0
  104. package/dist/lib/calculateFileChecksum.d.ts +1 -0
  105. package/dist/lib/calculateFileChecksum.js +16 -0
  106. package/dist/lib/calculateFileChecksum.js.map +1 -0
  107. package/dist/lib/copyDirectory.d.ts +6 -0
  108. package/dist/lib/copyDirectory.js +27 -0
  109. package/dist/lib/copyDirectory.js.map +1 -0
  110. package/dist/lib/decodeAudio.d.ts +1 -0
  111. package/dist/lib/decodeAudio.js +26 -0
  112. package/dist/lib/decodeAudio.js.map +1 -0
  113. package/dist/lib/downloadModelFile.d.ts +10 -0
  114. package/dist/lib/downloadModelFile.js +58 -0
  115. package/dist/lib/downloadModelFile.js.map +1 -0
  116. package/dist/lib/flattenMessageTextContent.d.ts +2 -0
  117. package/dist/lib/flattenMessageTextContent.js +11 -0
  118. package/dist/lib/flattenMessageTextContent.js.map +1 -0
  119. package/dist/lib/getCacheDirPath.d.ts +12 -0
  120. package/dist/lib/getCacheDirPath.js +31 -0
  121. package/dist/lib/getCacheDirPath.js.map +1 -0
  122. package/dist/lib/loadImage.d.ts +12 -0
  123. package/dist/lib/loadImage.js +30 -0
  124. package/dist/lib/loadImage.js.map +1 -0
  125. package/dist/lib/logger.d.ts +12 -0
  126. package/dist/lib/logger.js +98 -0
  127. package/dist/lib/logger.js.map +1 -0
  128. package/dist/lib/math.d.ts +7 -0
  129. package/dist/lib/math.js +30 -0
  130. package/dist/lib/math.js.map +1 -0
  131. package/dist/lib/resolveModelFileLocation.d.ts +15 -0
  132. package/dist/lib/resolveModelFileLocation.js +41 -0
  133. package/dist/lib/resolveModelFileLocation.js.map +1 -0
  134. package/dist/lib/util.d.ts +7 -0
  135. package/dist/lib/util.js +61 -0
  136. package/dist/lib/util.js.map +1 -0
  137. package/dist/lib/validateModelFile.d.ts +9 -0
  138. package/dist/lib/validateModelFile.js +62 -0
  139. package/dist/lib/validateModelFile.js.map +1 -0
  140. package/dist/lib/validateModelOptions.d.ts +3 -0
  141. package/dist/lib/validateModelOptions.js +23 -0
  142. package/dist/lib/validateModelOptions.js.map +1 -0
  143. package/dist/pool.d.ts +61 -0
  144. package/dist/pool.js +512 -0
  145. package/dist/pool.js.map +1 -0
  146. package/dist/server.d.ts +59 -0
  147. package/dist/server.js +221 -0
  148. package/dist/server.js.map +1 -0
  149. package/dist/standalone.d.ts +1 -0
  150. package/dist/standalone.js +306 -0
  151. package/dist/standalone.js.map +1 -0
  152. package/dist/store.d.ts +60 -0
  153. package/dist/store.js +203 -0
  154. package/dist/store.js.map +1 -0
  155. package/dist/types/completions.d.ts +57 -0
  156. package/dist/types/completions.js +2 -0
  157. package/dist/types/completions.js.map +1 -0
  158. package/dist/types/index.d.ts +326 -0
  159. package/dist/types/index.js +2 -0
  160. package/dist/types/index.js.map +1 -0
  161. package/docs/engines.md +28 -0
  162. package/docs/gpu.md +72 -0
  163. package/docs/http-api.md +147 -0
  164. package/examples/all-options.js +108 -0
  165. package/examples/chat-cli.js +56 -0
  166. package/examples/chat-server.js +65 -0
  167. package/examples/concurrency.js +70 -0
  168. package/examples/express.js +70 -0
  169. package/examples/pool.js +91 -0
  170. package/package.json +113 -0
  171. package/src/api/openai/enums.ts +20 -0
  172. package/src/api/openai/handlers/chat.ts +408 -0
  173. package/src/api/openai/handlers/completions.ts +196 -0
  174. package/src/api/openai/handlers/embeddings.ts +92 -0
  175. package/src/api/openai/handlers/images.ts +3 -0
  176. package/src/api/openai/handlers/models.ts +33 -0
  177. package/src/api/openai/handlers/transcription.ts +2 -0
  178. package/src/api/openai/index.ts +16 -0
  179. package/src/api/parseJSONRequestBody.ts +26 -0
  180. package/src/api/v1/DRAFT.md +16 -0
  181. package/src/api/v1/index.ts +37 -0
  182. package/src/cli.ts +9 -0
  183. package/src/engines/gpt4all/engine.ts +441 -0
  184. package/src/engines/gpt4all/util.ts +31 -0
  185. package/src/engines/index.ts +28 -0
  186. package/src/engines/node-llama-cpp/engine.ts +811 -0
  187. package/src/engines/node-llama-cpp/types.ts +17 -0
  188. package/src/engines/node-llama-cpp/util.ts +126 -0
  189. package/src/engines/node-llama-cpp/validateModelFile.ts +46 -0
  190. package/src/engines/stable-diffusion-cpp/engine.ts +369 -0
  191. package/src/engines/stable-diffusion-cpp/types.ts +54 -0
  192. package/src/engines/stable-diffusion-cpp/util.ts +58 -0
  193. package/src/engines/stable-diffusion-cpp/validateModelFiles.ts +119 -0
  194. package/src/engines/transformers-js/engine.ts +659 -0
  195. package/src/engines/transformers-js/types.ts +25 -0
  196. package/src/engines/transformers-js/util.ts +40 -0
  197. package/src/engines/transformers-js/validateModelFiles.ts +168 -0
  198. package/src/experiments/ChatWithVision.ts +103 -0
  199. package/src/experiments/StableDiffPromptGenerator.ts +2 -0
  200. package/src/experiments/VoiceFunctionCall.ts +71 -0
  201. package/src/http.ts +72 -0
  202. package/src/index.ts +7 -0
  203. package/src/instance.ts +723 -0
  204. package/src/lib/acquireFileLock.ts +38 -0
  205. package/src/lib/calculateContextIdentity.ts +53 -0
  206. package/src/lib/calculateFileChecksum.ts +18 -0
  207. package/src/lib/copyDirectory.ts +29 -0
  208. package/src/lib/decodeAudio.ts +39 -0
  209. package/src/lib/downloadModelFile.ts +70 -0
  210. package/src/lib/flattenMessageTextContent.ts +19 -0
  211. package/src/lib/getCacheDirPath.ts +34 -0
  212. package/src/lib/loadImage.ts +46 -0
  213. package/src/lib/logger.ts +112 -0
  214. package/src/lib/math.ts +31 -0
  215. package/src/lib/resolveModelFileLocation.ts +49 -0
  216. package/src/lib/util.ts +75 -0
  217. package/src/lib/validateModelFile.ts +71 -0
  218. package/src/lib/validateModelOptions.ts +31 -0
  219. package/src/pool.ts +651 -0
  220. package/src/server.ts +270 -0
  221. package/src/standalone.ts +320 -0
  222. package/src/store.ts +278 -0
  223. package/src/types/completions.ts +86 -0
  224. package/src/types/index.ts +488 -0
  225. package/tsconfig.json +29 -0
  226. package/tsconfig.release.json +11 -0
  227. package/vitest.config.ts +18 -0
@@ -0,0 +1,31 @@
1
+ import { BuiltInModelOptions } from '#package/types/index.js'
2
+ import { builtInEngineNames } from '#package/engines/index.js'
3
+
4
+ const modelIdPattern = /^[a-zA-Z0-9_\-\.]+$/
5
+ export function validateModelId(id: string) {
6
+ if (!modelIdPattern.test(id)) {
7
+ throw new Error(
8
+ `Model "${id}" has invalid name; requires ${modelIdPattern}`,
9
+ )
10
+ }
11
+ }
12
+
13
+ export function validateModelOptions(
14
+ id: string,
15
+ modelOptions: BuiltInModelOptions,
16
+ ) {
17
+ validateModelId(id)
18
+ if (!modelOptions.engine) {
19
+ throw new Error(`Model "${id}" must have an engine`)
20
+ }
21
+ const isSourceMissing =
22
+ !('file' in modelOptions && modelOptions.file) &&
23
+ !modelOptions.url &&
24
+ !modelOptions.location
25
+ if (builtInEngineNames.includes(modelOptions.engine) && isSourceMissing) {
26
+ throw new Error(`Model "${id}" must have either file or url`)
27
+ }
28
+ if (!modelOptions.task) {
29
+ throw new Error(`Model "${id}" must have a task`)
30
+ }
31
+ }
package/src/pool.ts ADDED
@@ -0,0 +1,651 @@
1
+ import process from 'node:process'
2
+ import PQueue from 'p-queue'
3
+ import EventEmitter3 from 'eventemitter3'
4
+ import { ModelInstance } from '#package/instance.js'
5
+ import {
6
+ ModelConfig,
7
+ IncomingRequest,
8
+ ModelInstanceRequest,
9
+ ModelEngine,
10
+ } from '#package/types/index.js'
11
+ import {
12
+ Logger,
13
+ LogLevels,
14
+ createSublogger,
15
+ LogLevel,
16
+ } from '#package/lib/logger.js'
17
+ import { mergeAbortSignals } from '#package/lib/util.js'
18
+
19
/**
 * Handle returned to callers that acquired a model instance.
 * `release` must be called once the caller is done so the instance
 * returns to the pool (or is disposed, if its ttl is 0).
 */
export interface ModelInstanceHandle {
	instance: ModelInstance
	release: () => Promise<void>
}

// Internal pairing of an instance with the request it served; used as
// the resolution value of a queued task in requestInstance.
interface ModelTask {
	instance: ModelInstance
	request: ModelInstanceRequest
}

// Optional hook invoked before an instance loads (the pool passes an
// abort signal tied to shutdown / the caller's signal).
type PrepareModelInstanceCallback = (
	instance: ModelInstance,
	signal?: AbortSignal,
) => Promise<void>

// Fully resolved pool configuration: concurrency defaulted, model ids
// filled in from the object keys.
interface ModelPoolConfig {
	concurrency: number
	models: Record<string, ModelConfig>
}

// User-facing constructor options for ModelPool.
export interface ModelPoolOptions {
	concurrency?: number
	models: Record<string, ModelConfig>
	log?: Logger | LogLevel
}

// Events emitted by the pool: 'ready' after init, 'spawn' when an
// instance finished loading, 'release' when an instance is returned.
type ModelPoolEvent = 'ready' | 'spawn' | 'release'
47
/**
 * Manages a set of ModelInstances and assigns incoming requests to
 * matching idle instances, spawning new ones when allowed.
 * Emits 'ready', 'spawn' and 'release' events (see ModelPoolEvent).
 */
export class ModelPool extends EventEmitter3<ModelPoolEvent> {
	queue: PQueue // limits concurrently processed requests to config.concurrency
	config: ModelPoolConfig
	instances: Record<string, ModelInstance> // live instances keyed by instance id
	private engines?: Record<string, ModelEngine> // set by init()
	private cleanupInterval?: NodeJS.Timeout // periodic ttl-based cleanup timer
	private log: Logger
	private requestSequence: number = 0 // counter used to tag requests for logging
	private pendingRequests: Set<ModelInstanceRequest> = new Set() // requests not yet picked up by the queue
	private shutdownController: AbortController = new AbortController() // aborted in dispose()
	private gpuLock: boolean = false // TODO could derive this from "is there any instance that has gpu=true"
	private prepareInstance?: PrepareModelInstanceCallback

	/**
	 * @param options pool options; each key of options.models becomes the
	 *   model id unless the config already carries one.
	 * @param prepareInstance optional hook run before each instance loads.
	 */
	constructor(
		options: ModelPoolOptions,
		prepareInstance?: PrepareModelInstanceCallback,
	) {
		super()
		this.log = createSublogger(options.log)
		// normalize model configs so every entry carries its id
		const models: Record<string, ModelConfig> = {}
		for (const id in options.models) {
			const modelConfig = options.models[id]
			models[id] = {
				...modelConfig,
				id: modelConfig.id ?? id,
			}
		}
		// concurrency defaults to 1; options may override it
		const config: ModelPoolConfig = {
			concurrency: 1,
			...options,
			models,
		}
		this.queue = new PQueue({
			concurrency: config.concurrency,
		})
		this.config = config
		this.instances = {}
		this.prepareInstance = prepareInstance
	}
86
+
87
	/**
	 * Starts up the pool: spawns minInstances for every model, waits for
	 * all of them to settle (failures do not abort init), emits 'ready'
	 * and starts the periodic ttl cleanup.
	 * @param engines map of engine name to engine implementation
	 */
	async init(engines: Record<string, ModelEngine>) {
		const initPromises = []
		const modelConfigs = this.config.models
		this.engines = engines

		// making sure id is set.
		for (const modelId in modelConfigs) {
			const modelConfig = modelConfigs[modelId]
			if (!modelConfig.id) {
				modelConfig.id = modelId
			}
		}

		// prioritize initializing the first model defined that has gpu explicitly set
		// so lock cant be acquired first by another model that has gpu=auto/undefined
		const firstGpuModel = Object.entries(modelConfigs).find(
			([id, config]) => !!config.device?.gpu && config.device?.gpu !== 'auto',
		)
		if (firstGpuModel) {
			const modelConfig = modelConfigs[firstGpuModel[0]]
			const spawnPromises = this.ensureModelInstances(modelConfig)
			initPromises.push(...spawnPromises)
		}

		// then handle other models in the order they were defined
		for (const modelId in modelConfigs) {
			if (firstGpuModel && modelId === firstGpuModel[0]) {
				continue
			}
			const modelConfig = modelConfigs[modelId]
			const spawnPromises = this.ensureModelInstances(modelConfig)
			initPromises.push(...spawnPromises)
		}

		// resolve when all initial instances are loaded (allSettled: a
		// failed spawn does not prevent the pool from becoming ready)
		await Promise.allSettled(initPromises)
		this.emit('ready')
		this.cleanupInterval = setInterval(() => {
			this.disposeOutdatedInstances()
		}, 1000 * 60) // every minute
	}
129
+
130
+ // see if the minInstances for a models are spawned. if not, spawn them.
131
+ ensureModelInstances(model: ModelConfig) {
132
+ const spawnPromises = []
133
+ const instanceCount = model.minInstances ?? 0
134
+ for (let i = 0; i < instanceCount; i++) {
135
+ if (this.canSpawnInstance(model.id)) {
136
+ const spawnPromise = this.spawnInstance(model.id)
137
+ spawnPromises.push(spawnPromise)
138
+ } else {
139
+ this.log(LogLevels.warn, 'Failed to spawn min instances for', {
140
+ model: model.id,
141
+ })
142
+ break
143
+ }
144
+ }
145
+ return spawnPromises
146
+ }
147
+
148
+ async dispose() {
149
+ this.log(LogLevels.debug, 'Disposing pool')
150
+ clearInterval(this.cleanupInterval)
151
+ super.removeAllListeners()
152
+ this.queue.pause()
153
+ this.queue.clear()
154
+ this.shutdownController.abort()
155
+ for (const request of this.pendingRequests) {
156
+ request.abortController.abort()
157
+ }
158
+ const disposePromises: Array<Promise<void>> = []
159
+ for (const key in this.instances) {
160
+ const instance = this.instances[key]
161
+ disposePromises.push(this.disposeInstance(instance))
162
+ }
163
+ await Promise.allSettled(disposePromises)
164
+ }
165
+
166
+ // disposes instances that have been idle for longer than their ttl
167
+ private disposeOutdatedInstances() {
168
+ const now = new Date().getTime()
169
+ for (const key in this.instances) {
170
+ const instance = this.instances[key]
171
+ const instanceAge = (now - instance.lastUsed) / 1000
172
+ const modelInstanceCount = Object.values(this.instances).filter(
173
+ (i) => i.modelId === instance.modelId,
174
+ ).length
175
+ const minInstanceCount =
176
+ this.config.models[instance.modelId].minInstances ?? 0
177
+ if (
178
+ modelInstanceCount > minInstanceCount &&
179
+ instanceAge > instance.ttl &&
180
+ instance.status === 'idle'
181
+ ) {
182
+ this.log(LogLevels.info, 'Auto disposing instance', {
183
+ instance: instance.id,
184
+ })
185
+ this.disposeInstance(instance)
186
+ }
187
+ }
188
+ }
189
+
190
+ getStatus() {
191
+ const processingInstances = Object.values(this.instances).filter(
192
+ (instance) => instance.status === 'busy',
193
+ )
194
+ const poolStatusInfo = {
195
+ processing: processingInstances.length,
196
+ pending: this.pendingRequests.size,
197
+ instances: Object.fromEntries(
198
+ Object.entries(this.instances).map(([key, instance]) => {
199
+ return [
200
+ key,
201
+ {
202
+ model: instance.modelId,
203
+ status: instance.status,
204
+ engine: instance.config.engine,
205
+ device: instance.gpu ? 'gpu' : 'cpu',
206
+ contextState: instance.getContextStateIdentity(),
207
+ lastUsed: new Date(instance.lastUsed).toISOString(),
208
+ },
209
+ ]
210
+ }),
211
+ ),
212
+ }
213
+ return poolStatusInfo
214
+ }
215
+
216
+ // checks if another instance can be spawned for given model
217
+ canSpawnInstance(modelId: string) {
218
+ const modelConfig = this.config.models[modelId]
219
+ // if the model is configured with gpu=true, interpret that as "it MUST run on gpu"
220
+ // and prevent spawning more instances if the gpu is already locked.
221
+ const requiresGpu = !!modelConfig.device?.gpu
222
+ if (requiresGpu && this.gpuLock && modelConfig.device?.gpu !== 'auto') {
223
+ this.log(
224
+ LogLevels.debug,
225
+ 'Cannot spawn new instance: model requires gpu, but its already in use',
226
+ { model: modelId },
227
+ )
228
+ return false
229
+ }
230
+ // see if we're within maxInstances limit
231
+ const maxInstances = modelConfig.maxInstances ?? 1
232
+ const currentInstances = Object.values(this.instances).filter(
233
+ (instance) => instance.modelId === modelId,
234
+ )
235
+ if (currentInstances.length >= maxInstances) {
236
+ this.log(
237
+ LogLevels.debug,
238
+ 'Cannot spawn new instance: maxInstances reached',
239
+ { model: modelId, curent: currentInstances.length, max: maxInstances },
240
+ )
241
+ return false
242
+ }
243
+ return true
244
+ }
245
+
246
+ private async disposeInstance(instance: ModelInstance) {
247
+ this.log(LogLevels.debug, 'Disposing instance', {
248
+ instance: instance.id,
249
+ })
250
+ await instance.dispose()
251
+ if (instance.gpu) {
252
+ this.gpuLock = false
253
+ }
254
+ delete this.instances[instance.id]
255
+ }
256
+
257
+ // spawns a new instance for the given model, without checking whether it's allowed
258
+ private async spawnInstance(
259
+ modelId: string,
260
+ options: { signal?: AbortSignal; emit?: boolean } = {},
261
+ ) {
262
+ if (!this.engines) {
263
+ throw new Error(
264
+ 'No engines available. Make sure the pool is initialized and ModelServer.start() or ModelPool.init() were called.',
265
+ )
266
+ }
267
+ const model = this.config.models[modelId]
268
+ const engine = this.engines[model.engine]
269
+ if (!engine) {
270
+ throw new Error(`Engine not found: ${model.engine}`)
271
+ }
272
+ const autoGpuEnabled = !!engine.autoGpu
273
+
274
+ // if the model is configured with gpu=auto (or unset), we can use the gpu if its not locked
275
+ const autoGpu =
276
+ model.device?.gpu === undefined || model.device?.gpu === 'auto'
277
+ let useGpu = autoGpu ? autoGpuEnabled && !this.gpuLock : false
278
+
279
+ if (!!model.device?.gpu) {
280
+ useGpu = true
281
+ }
282
+
283
+ const instance = new ModelInstance(engine, {
284
+ ...model,
285
+ gpu: useGpu,
286
+ log: this.log,
287
+ })
288
+ this.instances[instance.id] = instance
289
+
290
+ if (useGpu) {
291
+ this.gpuLock = true
292
+ }
293
+ const signals = [this.shutdownController.signal]
294
+ if (options.signal) {
295
+ signals.push(options.signal)
296
+ }
297
+ const abortSignal = mergeAbortSignals(signals)
298
+ if (this.prepareInstance) {
299
+ this.log(LogLevels.debug, 'Preparing instance', {
300
+ instance: instance.id,
301
+ })
302
+ try {
303
+ await this.prepareInstance(instance, abortSignal)
304
+ instance.status = 'idle'
305
+ } catch (error) {
306
+ this.log(LogLevels.error, 'Error preparing instance', {
307
+ model: modelId,
308
+ instance: instance.id,
309
+ error,
310
+ })
311
+ instance.status = 'error'
312
+ return instance
313
+ }
314
+ }
315
+ await instance.load(abortSignal)
316
+ if (options.emit !== false) {
317
+ this.emit('spawn', instance)
318
+ }
319
+ return instance
320
+ }
321
+
322
+ // wait to acquire a gpu instance for the given request
323
+ private acquireGpuInstance(
324
+ request: ModelInstanceRequest,
325
+ signal?: AbortSignal,
326
+ ): Promise<ModelInstance> {
327
+ return new Promise(async (resolve, reject) => {
328
+ // if we have an idle gpu instance and the model matches we can lock and return immediately
329
+ const gpuInstance = Object.values(this.instances).find(
330
+ (instance) => instance.gpu === true,
331
+ )!
332
+ if (gpuInstance.status === 'idle') {
333
+ if (gpuInstance.modelId === request.model) {
334
+ gpuInstance.lock(request)
335
+ resolve(gpuInstance)
336
+ return
337
+ } else {
338
+ await this.disposeInstance(gpuInstance)
339
+
340
+ const newInstance = await this.spawnInstance(request.model, {
341
+ emit: false,
342
+ })
343
+ newInstance.lock(request)
344
+ resolve(newInstance)
345
+ return
346
+ }
347
+ }
348
+
349
+ // otherwise attach the listener and wait until gpu slot becomes available
350
+ const listener = async (instance: ModelInstance) => {
351
+ if (instance.gpu === true && instance.status === 'idle') {
352
+ if (instance.matchesRequirements(request)) {
353
+ // model matches whats needed, lock and resolve
354
+ this.off('release', listener)
355
+ instance.lock(request)
356
+ resolve(instance)
357
+ } else {
358
+ // model doesnt match, dispose and spawn new instance
359
+ this.off('release', listener)
360
+ await this.disposeInstance(instance)
361
+ const newInstance = await this.spawnInstance(request.model, {
362
+ emit: false,
363
+ })
364
+ newInstance.lock(request)
365
+ resolve(newInstance)
366
+ }
367
+ }
368
+ }
369
+ this.on('release', listener)
370
+ if (signal) {
371
+ signal.addEventListener('abort', () => {
372
+ this.off('release', listener)
373
+ reject(signal.reason)
374
+ })
375
+ }
376
+ })
377
+ }
378
+
379
+ // wait to acquire an idle instance for the given request
380
+ private acquireIdleInstance(
381
+ request: ModelInstanceRequest,
382
+ signal?: AbortSignal,
383
+ ): Promise<ModelInstance> {
384
+ return new Promise((resolve, reject) => {
385
+ const listener = (instance: ModelInstance) => {
386
+ if (
387
+ instance.matchesRequirements(request) &&
388
+ instance.status === 'idle'
389
+ ) {
390
+ this.off('release', listener)
391
+ this.off('spawn', listener)
392
+ try {
393
+ instance.lock(request)
394
+ resolve(instance)
395
+ } catch (error: any) {
396
+ this.log(LogLevels.error, 'Error acquiring idle instance', {
397
+ error,
398
+ })
399
+ reject(error)
400
+ }
401
+ }
402
+ }
403
+ this.on('spawn', listener)
404
+ this.on('release', listener)
405
+ if (signal) {
406
+ signal.addEventListener('abort', () => {
407
+ this.off('release', listener)
408
+ this.off('spawn', listener)
409
+ reject(signal.reason)
410
+ })
411
+ }
412
+ })
413
+ }
414
+
415
	/**
	 * Acquires and locks an instance for the given request, trying in
	 * order: (1) an idle instance with matching cached context, (2) an
	 * idle instance without context state, (3) spawning a new instance,
	 * (4) the least-recently-used idle instance (context reset), (5) for
	 * gpu-required models, taking over the gpu slot, (6) waiting for any
	 * matching instance to be released or spawned.
	 */
	private async acquireInstance(
		request: ModelInstanceRequest,
		signal?: AbortSignal,
	) {
		if ('messages' in request || 'prompt' in request) {
			// for text and chat completions first search for an instance that has the context ready
			for (const key in this.instances) {
				const instance = this.instances[key]
				if (
					instance.matchesRequirements(request) &&
					instance.status === 'idle' &&
					instance.matchesContextState(request)
				) {
					this.log(LogLevels.debug, 'Cache hit - reusing cached instance', {
						instance: instance.id,
						sequence: request.sequence,
					})
					instance.lock(request)
					return instance
				}
			}
			this.log(
				LogLevels.debug,
				'Cache miss - continue acquiring model instance',
				{ sequence: request.sequence },
			)
		}

		// prefer an instance of the model that has no context state.
		for (const key in this.instances) {
			const instance = this.instances[key]
			if (
				instance.matchesRequirements(request) &&
				instance.status === 'idle' &&
				!instance.hasContextState()
			) {
				this.log(
					LogLevels.debug,
					'Reusing idle instance without context state',
					{
						instance: instance.id,
						sequence: request.sequence,
					},
				)
				instance.lock(request)
				return instance
			}
		}

		// still havent found any, see if we're allowed to spawn a new instance
		if (this.canSpawnInstance(request.model)) {
			const instance = await this.spawnInstance(request.model, {
				emit: false,
			})
			// reset the context if the request doesnt match the instances preloaded context state
			const hasInitialContextState = instance.config.initialMessages?.length || instance.config.prefix
			if (hasInitialContextState && !instance.matchesContextState(request)) {
				instance.resetContext()
			}
			this.log(LogLevels.debug, 'Spawned instance acquired', {
				instance: instance.id,
				sequence: request.sequence,
			})
			instance.lock(request)
			return instance
		}

		// if all instances have cached state, prefer the one that was used the longest time ago
		const availableInstances = Object.values(this.instances).filter(
			(instance) =>
				instance.matchesRequirements(request) && instance.status === 'idle',
		)
		if (availableInstances.length > 0) {
			const lruInstance = availableInstances.reduce((prev, current) =>
				prev.lastUsed < current.lastUsed ? prev : current,
			)
			this.log(LogLevels.debug, 'Reusing least recently used instance', {
				instance: lruInstance.id,
				sequence: request.sequence,
			})
			lruInstance.lock(request)
			lruInstance.resetContext() // make sure we reset its cache.
			return lruInstance
		}

		// model must run on gpu but another model currently holds the slot:
		// wait to take over the gpu instance
		const requiresGpu = this.config.models[request.model].device?.gpu === true
		if (requiresGpu && this.gpuLock) {
			const gpuInstance = Object.values(this.instances).find(
				(instance) => instance.gpu === true,
			)!

			if (gpuInstance.modelId !== request.model) {
				this.log(LogLevels.debug, 'GPU already in use, waiting ...', {
					sequence: request.sequence,
				})
				const instance = await this.acquireGpuInstance(request, signal)
				this.log(LogLevels.debug, 'GPU instance acquired', {
					instance: instance.id,
					sequence: request.sequence,
				})
				if (signal?.aborted) {
					instance.unlock()
					throw signal.reason
				} else {
					return instance
				}
			}
		}

		// before starting to wait, make sure we're not stuck with an error'd instance (and wait forever)
		// currently instances only enter error state if prepareInstance throws an error
		const errorInstance = Object.values(this.instances).find(
			(instance) =>
				instance.modelId === request.model && instance.status === 'error',
		)
		if (errorInstance) {
			throw new Error('Instance is in error state')
		}

		// wait until an instance of our model is released or spawned
		this.log(LogLevels.debug, 'Awaiting idle instance', {
			model: request.model,
			sequence: request.sequence,
		})
		const instance = await this.acquireIdleInstance(request, signal)
		this.log(LogLevels.debug, 'Idle instance acquired', {
			instance: instance.id,
			sequence: request.sequence,
		})

		// the signal may have fired while we were waiting; undo the lock
		if (signal?.aborted) {
			instance.unlock()
			throw signal.reason
		} else {
			return instance
		}
	}
553
+
554
+ private createRequestSequence() {
555
+ if (this.requestSequence > 999999) {
556
+ this.requestSequence = 0
557
+ }
558
+ return ++this.requestSequence
559
+ }
560
+
561
	/**
	 * Requests a model instance from the pool. Resolves once an instance
	 * has been acquired and locked for this request; processing capacity
	 * is accounted for via the internal queue until the caller invokes
	 * the returned `release`.
	 * @param incomingRequest the request (must name a configured model)
	 * @param signal optional caller-side abort signal
	 * @throws Error when the pool is disposed or the model is unknown
	 */
	async requestInstance(
		incomingRequest: IncomingRequest,
		signal?: AbortSignal,
	): Promise<ModelInstanceHandle> {
		if (this.shutdownController.signal.aborted) {
			throw new Error('Pool is disposed')
		}
		const requestSequence = this.createRequestSequence()
		const request = {
			...incomingRequest,
			sequence: requestSequence,
			abortController: new AbortController(),
		}
		if (!this.config.models[request.model]) {
			this.log(LogLevels.error, `Model not found: ${request.model}`)
			throw new Error(`Model not found: ${request.model}`)
		}

		this.log(LogLevels.info, 'Incoming request', {
			model: request.model,
			sequence: request.sequence,
		})

		// track as pending until the queue picks the task up
		this.pendingRequests.add(request)
		const abortSignal = mergeAbortSignals([
			request.abortController.signal,
			signal,
		])
		abortSignal.addEventListener('abort', () => {
			this.log(LogLevels.info, 'Request aborted', {
				model: request.model,
				sequence: request.sequence,
			})
			this.pendingRequests.delete(request)
		})
		const instance = await this.acquireInstance(request, abortSignal)

		// once instance is acquired & locked, we can pass it on to the caller
		// the queue task promise will be forwarded as releaseInstance
		let resolveQueueTask: (value: ModelTask) => void = () => {}

		this.queue
			.add((): Promise<ModelTask> => {
				this.pendingRequests.delete(request)
				// this promise occupies a queue slot until release is called
				return new Promise((resolve, reject) => {
					resolveQueueTask = resolve
				})
			})
			.then((task) => {
				// if there are more requests waiting, prioritize handling them before spawning new instances
				// deferred to avoid AbortError when the pool is disposed right after the operation
				process.nextTick(() => {
					if (
						!this.pendingRequests.size &&
						this.canSpawnInstance(request.model) &&
						!this.shutdownController.signal.aborted
					) {
						this.spawnInstance(request.model)
					}
				})
				if (task?.instance) {
					this.emit('release', instance)
				}
			})

		// TODO what if user never calls release? automatically resolve or reject after a timeout?
		const releaseInstance = () => {
			return new Promise<void>((resolve, reject) => {
				// deferred so the caller's current operation fully completes first
				process.nextTick(() => {
					resolveQueueTask({ instance, request })
					this.log(LogLevels.info, 'Task completed, releasing', {
						instance: instance.id,
						sequence: request.sequence,
					})
					// ttl=0 means dispose immediately instead of returning to the pool
					if (instance.config.ttl === 0) {
						this.disposeInstance(instance)
					} else {
						instance.unlock()
					}
					resolve()
				})
			})
		}

		return {
			instance,
			release: releaseInstance,
		}
	}
651
+ }