inference-server 1.0.0-beta.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. package/README.md +216 -0
  2. package/dist/api/openai/enums.d.ts +4 -0
  3. package/dist/api/openai/enums.js +17 -0
  4. package/dist/api/openai/enums.js.map +1 -0
  5. package/dist/api/openai/handlers/chat.d.ts +3 -0
  6. package/dist/api/openai/handlers/chat.js +358 -0
  7. package/dist/api/openai/handlers/chat.js.map +1 -0
  8. package/dist/api/openai/handlers/completions.d.ts +3 -0
  9. package/dist/api/openai/handlers/completions.js +169 -0
  10. package/dist/api/openai/handlers/completions.js.map +1 -0
  11. package/dist/api/openai/handlers/embeddings.d.ts +3 -0
  12. package/dist/api/openai/handlers/embeddings.js +74 -0
  13. package/dist/api/openai/handlers/embeddings.js.map +1 -0
  14. package/dist/api/openai/handlers/images.d.ts +0 -0
  15. package/dist/api/openai/handlers/images.js +4 -0
  16. package/dist/api/openai/handlers/images.js.map +1 -0
  17. package/dist/api/openai/handlers/models.d.ts +3 -0
  18. package/dist/api/openai/handlers/models.js +23 -0
  19. package/dist/api/openai/handlers/models.js.map +1 -0
  20. package/dist/api/openai/handlers/transcription.d.ts +0 -0
  21. package/dist/api/openai/handlers/transcription.js +4 -0
  22. package/dist/api/openai/handlers/transcription.js.map +1 -0
  23. package/dist/api/openai/index.d.ts +7 -0
  24. package/dist/api/openai/index.js +14 -0
  25. package/dist/api/openai/index.js.map +1 -0
  26. package/dist/api/parseJSONRequestBody.d.ts +2 -0
  27. package/dist/api/parseJSONRequestBody.js +24 -0
  28. package/dist/api/parseJSONRequestBody.js.map +1 -0
  29. package/dist/api/v1/index.d.ts +2 -0
  30. package/dist/api/v1/index.js +29 -0
  31. package/dist/api/v1/index.js.map +1 -0
  32. package/dist/cli.d.ts +1 -0
  33. package/dist/cli.js +10 -0
  34. package/dist/cli.js.map +1 -0
  35. package/dist/engines/gpt4all/engine.d.ts +34 -0
  36. package/dist/engines/gpt4all/engine.js +357 -0
  37. package/dist/engines/gpt4all/engine.js.map +1 -0
  38. package/dist/engines/gpt4all/util.d.ts +3 -0
  39. package/dist/engines/gpt4all/util.js +29 -0
  40. package/dist/engines/gpt4all/util.js.map +1 -0
  41. package/dist/engines/index.d.ts +19 -0
  42. package/dist/engines/index.js +21 -0
  43. package/dist/engines/index.js.map +1 -0
  44. package/dist/engines/node-llama-cpp/engine.d.ts +49 -0
  45. package/dist/engines/node-llama-cpp/engine.js +666 -0
  46. package/dist/engines/node-llama-cpp/engine.js.map +1 -0
  47. package/dist/engines/node-llama-cpp/types.d.ts +13 -0
  48. package/dist/engines/node-llama-cpp/types.js +2 -0
  49. package/dist/engines/node-llama-cpp/types.js.map +1 -0
  50. package/dist/engines/node-llama-cpp/util.d.ts +15 -0
  51. package/dist/engines/node-llama-cpp/util.js +84 -0
  52. package/dist/engines/node-llama-cpp/util.js.map +1 -0
  53. package/dist/engines/node-llama-cpp/validateModelFile.d.ts +8 -0
  54. package/dist/engines/node-llama-cpp/validateModelFile.js +36 -0
  55. package/dist/engines/node-llama-cpp/validateModelFile.js.map +1 -0
  56. package/dist/engines/stable-diffusion-cpp/engine.d.ts +90 -0
  57. package/dist/engines/stable-diffusion-cpp/engine.js +294 -0
  58. package/dist/engines/stable-diffusion-cpp/engine.js.map +1 -0
  59. package/dist/engines/stable-diffusion-cpp/types.d.ts +3 -0
  60. package/dist/engines/stable-diffusion-cpp/types.js +2 -0
  61. package/dist/engines/stable-diffusion-cpp/types.js.map +1 -0
  62. package/dist/engines/stable-diffusion-cpp/util.d.ts +4 -0
  63. package/dist/engines/stable-diffusion-cpp/util.js +55 -0
  64. package/dist/engines/stable-diffusion-cpp/util.js.map +1 -0
  65. package/dist/engines/stable-diffusion-cpp/validateModelFiles.d.ts +19 -0
  66. package/dist/engines/stable-diffusion-cpp/validateModelFiles.js +91 -0
  67. package/dist/engines/stable-diffusion-cpp/validateModelFiles.js.map +1 -0
  68. package/dist/engines/transformers-js/engine.d.ts +37 -0
  69. package/dist/engines/transformers-js/engine.js +538 -0
  70. package/dist/engines/transformers-js/engine.js.map +1 -0
  71. package/dist/engines/transformers-js/types.d.ts +7 -0
  72. package/dist/engines/transformers-js/types.js +2 -0
  73. package/dist/engines/transformers-js/types.js.map +1 -0
  74. package/dist/engines/transformers-js/util.d.ts +7 -0
  75. package/dist/engines/transformers-js/util.js +36 -0
  76. package/dist/engines/transformers-js/util.js.map +1 -0
  77. package/dist/engines/transformers-js/validateModelFiles.d.ts +17 -0
  78. package/dist/engines/transformers-js/validateModelFiles.js +133 -0
  79. package/dist/engines/transformers-js/validateModelFiles.js.map +1 -0
  80. package/dist/experiments/ChatWithVision.d.ts +11 -0
  81. package/dist/experiments/ChatWithVision.js +91 -0
  82. package/dist/experiments/ChatWithVision.js.map +1 -0
  83. package/dist/experiments/StableDiffPromptGenerator.d.ts +0 -0
  84. package/dist/experiments/StableDiffPromptGenerator.js +4 -0
  85. package/dist/experiments/StableDiffPromptGenerator.js.map +1 -0
  86. package/dist/experiments/VoiceFunctionCall.d.ts +18 -0
  87. package/dist/experiments/VoiceFunctionCall.js +51 -0
  88. package/dist/experiments/VoiceFunctionCall.js.map +1 -0
  89. package/dist/http.d.ts +19 -0
  90. package/dist/http.js +54 -0
  91. package/dist/http.js.map +1 -0
  92. package/dist/index.d.ts +7 -0
  93. package/dist/index.js +8 -0
  94. package/dist/index.js.map +1 -0
  95. package/dist/instance.d.ts +88 -0
  96. package/dist/instance.js +594 -0
  97. package/dist/instance.js.map +1 -0
  98. package/dist/lib/acquireFileLock.d.ts +7 -0
  99. package/dist/lib/acquireFileLock.js +38 -0
  100. package/dist/lib/acquireFileLock.js.map +1 -0
  101. package/dist/lib/calculateContextIdentity.d.ts +7 -0
  102. package/dist/lib/calculateContextIdentity.js +39 -0
  103. package/dist/lib/calculateContextIdentity.js.map +1 -0
  104. package/dist/lib/calculateFileChecksum.d.ts +1 -0
  105. package/dist/lib/calculateFileChecksum.js +16 -0
  106. package/dist/lib/calculateFileChecksum.js.map +1 -0
  107. package/dist/lib/copyDirectory.d.ts +6 -0
  108. package/dist/lib/copyDirectory.js +27 -0
  109. package/dist/lib/copyDirectory.js.map +1 -0
  110. package/dist/lib/decodeAudio.d.ts +1 -0
  111. package/dist/lib/decodeAudio.js +26 -0
  112. package/dist/lib/decodeAudio.js.map +1 -0
  113. package/dist/lib/downloadModelFile.d.ts +10 -0
  114. package/dist/lib/downloadModelFile.js +58 -0
  115. package/dist/lib/downloadModelFile.js.map +1 -0
  116. package/dist/lib/flattenMessageTextContent.d.ts +2 -0
  117. package/dist/lib/flattenMessageTextContent.js +11 -0
  118. package/dist/lib/flattenMessageTextContent.js.map +1 -0
  119. package/dist/lib/getCacheDirPath.d.ts +12 -0
  120. package/dist/lib/getCacheDirPath.js +31 -0
  121. package/dist/lib/getCacheDirPath.js.map +1 -0
  122. package/dist/lib/loadImage.d.ts +12 -0
  123. package/dist/lib/loadImage.js +30 -0
  124. package/dist/lib/loadImage.js.map +1 -0
  125. package/dist/lib/logger.d.ts +12 -0
  126. package/dist/lib/logger.js +98 -0
  127. package/dist/lib/logger.js.map +1 -0
  128. package/dist/lib/math.d.ts +7 -0
  129. package/dist/lib/math.js +30 -0
  130. package/dist/lib/math.js.map +1 -0
  131. package/dist/lib/resolveModelFileLocation.d.ts +15 -0
  132. package/dist/lib/resolveModelFileLocation.js +41 -0
  133. package/dist/lib/resolveModelFileLocation.js.map +1 -0
  134. package/dist/lib/util.d.ts +7 -0
  135. package/dist/lib/util.js +61 -0
  136. package/dist/lib/util.js.map +1 -0
  137. package/dist/lib/validateModelFile.d.ts +9 -0
  138. package/dist/lib/validateModelFile.js +62 -0
  139. package/dist/lib/validateModelFile.js.map +1 -0
  140. package/dist/lib/validateModelOptions.d.ts +3 -0
  141. package/dist/lib/validateModelOptions.js +23 -0
  142. package/dist/lib/validateModelOptions.js.map +1 -0
  143. package/dist/pool.d.ts +61 -0
  144. package/dist/pool.js +512 -0
  145. package/dist/pool.js.map +1 -0
  146. package/dist/server.d.ts +59 -0
  147. package/dist/server.js +221 -0
  148. package/dist/server.js.map +1 -0
  149. package/dist/standalone.d.ts +1 -0
  150. package/dist/standalone.js +306 -0
  151. package/dist/standalone.js.map +1 -0
  152. package/dist/store.d.ts +60 -0
  153. package/dist/store.js +203 -0
  154. package/dist/store.js.map +1 -0
  155. package/dist/types/completions.d.ts +57 -0
  156. package/dist/types/completions.js +2 -0
  157. package/dist/types/completions.js.map +1 -0
  158. package/dist/types/index.d.ts +326 -0
  159. package/dist/types/index.js +2 -0
  160. package/dist/types/index.js.map +1 -0
  161. package/docs/engines.md +28 -0
  162. package/docs/gpu.md +72 -0
  163. package/docs/http-api.md +147 -0
  164. package/examples/all-options.js +108 -0
  165. package/examples/chat-cli.js +56 -0
  166. package/examples/chat-server.js +65 -0
  167. package/examples/concurrency.js +70 -0
  168. package/examples/express.js +70 -0
  169. package/examples/pool.js +91 -0
  170. package/package.json +113 -0
  171. package/src/api/openai/enums.ts +20 -0
  172. package/src/api/openai/handlers/chat.ts +408 -0
  173. package/src/api/openai/handlers/completions.ts +196 -0
  174. package/src/api/openai/handlers/embeddings.ts +92 -0
  175. package/src/api/openai/handlers/images.ts +3 -0
  176. package/src/api/openai/handlers/models.ts +33 -0
  177. package/src/api/openai/handlers/transcription.ts +2 -0
  178. package/src/api/openai/index.ts +16 -0
  179. package/src/api/parseJSONRequestBody.ts +26 -0
  180. package/src/api/v1/DRAFT.md +16 -0
  181. package/src/api/v1/index.ts +37 -0
  182. package/src/cli.ts +9 -0
  183. package/src/engines/gpt4all/engine.ts +441 -0
  184. package/src/engines/gpt4all/util.ts +31 -0
  185. package/src/engines/index.ts +28 -0
  186. package/src/engines/node-llama-cpp/engine.ts +811 -0
  187. package/src/engines/node-llama-cpp/types.ts +17 -0
  188. package/src/engines/node-llama-cpp/util.ts +126 -0
  189. package/src/engines/node-llama-cpp/validateModelFile.ts +46 -0
  190. package/src/engines/stable-diffusion-cpp/engine.ts +369 -0
  191. package/src/engines/stable-diffusion-cpp/types.ts +54 -0
  192. package/src/engines/stable-diffusion-cpp/util.ts +58 -0
  193. package/src/engines/stable-diffusion-cpp/validateModelFiles.ts +119 -0
  194. package/src/engines/transformers-js/engine.ts +659 -0
  195. package/src/engines/transformers-js/types.ts +25 -0
  196. package/src/engines/transformers-js/util.ts +40 -0
  197. package/src/engines/transformers-js/validateModelFiles.ts +168 -0
  198. package/src/experiments/ChatWithVision.ts +103 -0
  199. package/src/experiments/StableDiffPromptGenerator.ts +2 -0
  200. package/src/experiments/VoiceFunctionCall.ts +71 -0
  201. package/src/http.ts +72 -0
  202. package/src/index.ts +7 -0
  203. package/src/instance.ts +723 -0
  204. package/src/lib/acquireFileLock.ts +38 -0
  205. package/src/lib/calculateContextIdentity.ts +53 -0
  206. package/src/lib/calculateFileChecksum.ts +18 -0
  207. package/src/lib/copyDirectory.ts +29 -0
  208. package/src/lib/decodeAudio.ts +39 -0
  209. package/src/lib/downloadModelFile.ts +70 -0
  210. package/src/lib/flattenMessageTextContent.ts +19 -0
  211. package/src/lib/getCacheDirPath.ts +34 -0
  212. package/src/lib/loadImage.ts +46 -0
  213. package/src/lib/logger.ts +112 -0
  214. package/src/lib/math.ts +31 -0
  215. package/src/lib/resolveModelFileLocation.ts +49 -0
  216. package/src/lib/util.ts +75 -0
  217. package/src/lib/validateModelFile.ts +71 -0
  218. package/src/lib/validateModelOptions.ts +31 -0
  219. package/src/pool.ts +651 -0
  220. package/src/server.ts +270 -0
  221. package/src/standalone.ts +320 -0
  222. package/src/store.ts +278 -0
  223. package/src/types/completions.ts +86 -0
  224. package/src/types/index.ts +488 -0
  225. package/tsconfig.json +29 -0
  226. package/tsconfig.release.json +11 -0
  227. package/vitest.config.ts +18 -0
package/docs/gpu.md ADDED
@@ -0,0 +1,72 @@
1
+
2
+ ### On GPU usage
3
+
4
+ Only one model instance can run on gpu at a time. Instances can not switch between gpu and cpu. If left unconfigured, the first spawned instance of a model will automatically acquire gpu lock and use it. Note that if `minInstances` is set to something greater than 0 then the order in which models are configured will matter because initial instances will also be spawned in that order.
5
+
6
+ Automatic / unconfigured gpu usage:
7
+
8
+ ```ts
9
+ {
10
+ models: {
11
+ 'model1': {
12
+ task: 'text-completion',
13
+ engine: 'gpt4all',
14
+ url: 'https://gpt4all.io/models/gguf/Meta-Llama-3-8B-Instruct.Q4_0.gguf',
15
+ // first instance will automatically pick up gpu, then a second one will be spawned on cpu
16
+ minInstances: 2,
17
+ },
18
+ 'model2': {
19
+ task: 'text-completion',
20
+ engine: 'gpt4all',
21
+ url: 'https://huggingface.co/NousResearch/Nous-Hermes-2-Mistral-7B-DPO-GGUF/resolve/main/Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf',
22
+ minInstances: 1, // this will always spawn on cpu because model1 already auto locked gpu
23
+ },
24
+ },
25
+ }
26
+ ```
27
+
28
+ Another practical strategy is to explicitly configure gpu usage. The same model can be configured multiple times with different options so that the gpu instance can be targeted specifically, like so:
29
+
30
+ ```ts
31
+ {
32
+ models: {
33
+ 'model1': {
34
+ task: 'text-completion',
35
+ engine: 'gpt4all',
36
+ url: 'https://gpt4all.io/models/gguf/Meta-Llama-3-8B-Instruct.Q4_0.gguf',
37
+ device: { gpu: true }, // this is effectively also maxInstances: 1
38
+ },
39
+ 'model2': {
40
+ task: 'text-completion',
41
+ engine: 'gpt4all',
42
+ url: 'https://gpt4all.io/models/gguf/Meta-Llama-3-8B-Instruct.Q4_0.gguf',
43
+ // will spawn up to 2 cpu instances
44
+ device: { gpu: false },
45
+ maxInstances: 2,
46
+ },
47
+ },
48
+ }
49
+ ```
50
+
51
+ It is possible to configure multiple models to use gpu, but only one gpu instance can be utilized at a time. If simultaneous requests to gpu models come in, each request will wait for the processing instance to release gpu lock before it can start. Then, depending on which models are requested in which order, instances may be disposed and spawned, or reused.
52
+
53
+ ```ts
54
+ {
55
+ models: {
56
+ 'model1': {
57
+ task: 'text-completion',
58
+ engine: 'gpt4all',
59
+ url: 'https://gpt4all.io/models/gguf/Meta-Llama-3-8B-Instruct.Q4_0.gguf',
60
+ device: { gpu: true },
61
+ },
62
+ 'model2': {
63
+ task: 'text-completion',
64
+ engine: 'gpt4all',
65
+ url: 'https://huggingface.co/NousResearch/Nous-Hermes-2-Mistral-7B-DPO-GGUF/resolve/main/Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf',
66
+ device: { gpu: true },
67
+ },
68
+ },
69
+ }
70
+ ```
71
+
72
+ Note that frequently switching the model that runs on gpu (i.e. incoming requests `model1->model2->model1->model2`) will lead to inefficient cache usage for chat completions and generally make requests slower, because models need to be unloaded and loaded and chat history has to be reingested.
@@ -0,0 +1,147 @@
1
+ ### HTTP API
2
+
3
+ Note that the HTTP API is currently not secure (i.e. it's probably DoS-able, only minimal input validation). You should not host this on a public server without additional protections.
4
+
5
+ On the packaged web server there is currently only one additional HTTP endpoint:
6
+
7
+ - `GET /` - Prints info about spawned instances, available models and ongoing downloads.
8
+
9
+ #### OpenAI-Style API
10
+
11
+ `/openai/v1` is the default base path. The following endpoints and parameters are supported:
12
+
13
+ | Endpoints | gpt4all | node-llama-cpp | transformers-js |
14
+ | ----------------------- | ------- | -------------- | --------------- |
15
+ | v1/chat/completions | ✅ | ✅ | 🚧 |
16
+ | v1/completions | ✅ | ✅ | 🚧 |
17
+ | v1/embeddings | ✅ | ✅ | 🚧 |
18
+ | v1/models | ✅ | ✅ | ✅ |
19
+ | v1/audio/transcriptions | ❌ | ❌ | 🚧 |
20
+
21
+ | Text Compl Params | gpt4all | node-llama-cpp |
22
+ | ------------------- | ------- | -------------- |
23
+ | stream | ✅ | ✅ |
24
+ | temperature | ✅ | ✅ |
25
+ | max_tokens | ✅ | ✅ |
26
+ | top_p | ✅ | ✅ |
27
+ | stop | ✅ | ✅ |
28
+ | seed | ❌ | ✅ |
29
+ | frequency_penalty | ❌ | ✅ |
30
+ | presence_penalty | ❌ | ✅ |
31
+ | best_of | ❌ | ❌ |
32
+ | n | ❌ | ❌ |
33
+ | logprobs | ❌ | ❌ |
34
+ | top_logprobs | ❌ | ❌ |
35
+ | logit_bias | ❌ | ✅ |
36
+ | response_format | ❌ | ✅ |
37
+ | tools | ❌ | ✅ |
38
+ | tool_choice | ❌ | ❌ |
39
+ | suffix | ❌ | ❌ |
40
+ | echo | ❌ | ❌ |
41
+
42
+ Some additional llama.cpp- and gpt4all-specific parameters are supported:
43
+
44
+ | Non-spec params | gpt4all | node-llama-cpp |
45
+ | ------------------- | ------- | -------------- |
46
+ | top_k | ✅ | ✅ |
47
+ | min_p | ✅ | ✅ |
48
+ | repeat_penalty_num | ✅ | ✅ |
49
+ | repeat_penalty | ✅ | - |
50
+
51
+ #### Functionality
52
+
53
+ | Feature | gpt4all | node-llama-cpp |
54
+ | --------------------- | ------- | -------------- |
55
+ | Streaming | ✅ | ✅ |
56
+ | Chat context cache | ✅ | ✅ |
57
+ | System prompt | ✅ | ✅ |
58
+ | Grammar | ❌ | ✅ |
59
+ | Function Calling | ❌ | ✅ |
60
+
61
+ #### Usage
62
+
63
+ ```js lllms.js
64
+ import { startHTTPServer } from 'lllms'
65
+
66
+ // Starts a http server for up to two instances of phi3 and serves them via openai API.
67
+ // startHTTPServer is only a thin wrapper around the ModelServer class that spawns a web server.
68
+ startHTTPServer({
69
+ concurrency: 2,
70
+ models: {
71
+ 'phi3-mini-4k': {
72
+ task: 'text-completion',
73
+ engine: 'node-llama-cpp',
74
+ url: 'https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf',
75
+ preload: {
76
+ // Note that for preloading to be utilized, requests must
77
+ // also have these leading messages before the user message.
78
+ messages: [
79
+ {
80
+ role: 'system',
81
+ content: 'You are the Batman.',
82
+ },
83
+ ],
84
+ },
85
+ // Use these to control resource usage.
86
+ contextSize: 1024, // Maximum context size. Will be determined automatically if not set.
87
+ maxInstances: 2, // How many active sessions you wanna be able to cache at the same time.
88
+ minInstances: 1, // To always keep at least one instance ready. Defaults to 0.
89
+ ttl: 300, // Idle sessions will be disposed after this many seconds.
90
+ // Set defaults for completions. These can be overridden per request.
91
+ // If unset, default values depend on the engine.
92
+ completionDefaults: {
93
+ temperature: 1,
94
+ },
95
+ },
96
+ },
97
+ // HTTP listen options. If you don't need a web server, use `startModelServer` or `new ModelServer()`.
98
+ // Apart from `listen` they take the same configuration.
99
+ listen: {
100
+ port: 3000,
101
+ },
102
+ })
103
+ // During download requests to a model will stall to get processed once the model is ready.
104
+ // http://localhost:3000 will serve a JSON of the current state of the server.
105
+ ```
106
+ ```sh
107
+ $ curl http://localhost:3000/openai/v1/chat/completions \
108
+ -H "Content-Type: application/json" \
109
+ -d '{
110
+ "model": "phi3-mini-4k",
111
+ "messages": [
112
+ {
113
+ "role": "system",
114
+ "content": "You are the Batman."
115
+ },
116
+ {
117
+ "role": "user",
118
+ "content": "im robin, lets count to 10!"
119
+ }
120
+ ]
121
+ }'
122
+ ```
123
+ ```json
124
+ {
125
+ "id": "phi3-mini-4k:pfBGvlYg-z6dPZUn9",
126
+ "model": "phi3-mini-4k",
127
+ "object": "chat.completion",
128
+ "created": 1720412918,
129
+ "system_fingerprint": "b38af554bea1fb9867db54ebeff59d0590c5ce48",
130
+ "choices": [
131
+ {
132
+ "index": 0,
133
+ "message": {
134
+ "role": "assistant",
135
+ "content": "Hello, Robin! As Batman, my focus is on protecting Gotham City and ensuring justice prevails. However, let's have a quick exercise to lighten the mood. Ready?\n\n1... 2... 3... 4... 5... 6... 7... 8... 9... And 10! Great job!\n\nRemember, my mission as Batman never ends, but it's always good to recharge and have fun alongside our partners. Let's keep Gotham safe together."
136
+ },
137
+ "logprobs": null,
138
+ "finish_reason": "stop"
139
+ }
140
+ ],
141
+ "usage": {
142
+ "prompt_tokens": 12,
143
+ "completion_tokens": 118,
144
+ "total_tokens": 130
145
+ }
146
+ }
147
+ ```
@@ -0,0 +1,108 @@
1
+ import { startHTTPServer } from '../dist/http.js'
2
+
3
+ // Starts a http server for up to two instances of phi3 and serves them via openai API.
4
+ // startHTTPServer is only a thin wrapper around the ModelServer class that spawns a web server.
5
+ startHTTPServer({
6
+ log: 'info', // 'debug', 'info', 'warn', 'error' - or pass a function as custom logger.
7
+ // Limit how many instances may be handed out concurrently for processing.
8
+ // If it's exceeded, requests will be queued up and stall until a model becomes available.
9
+ // Defaults to 1 = process one request at a time.
10
+ concurrency: 2,
11
+ // Where to cache to disk. Defaults to `~/.cache/node/inference-server`
12
+ // cachePath: '/path/to/cache',
13
+ models: {
14
+ // Specify as many models as you want. Identifiers can use a-zA-Z0-9_:\-\.
15
+ // Required are `task`, `engine`, `url` and/or `file`.
16
+ 'my-model': {
17
+ task: 'text-completion', // 'text-completion', 'embedding', 'image-to-text', 'speech-to-text'
18
+ engine: 'node-llama-cpp', // 'node-llama-cpp', 'transformers-js', 'gpt4all'
19
+ // Model weights may be specified by file and/or url.
20
+ url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
21
+ // specify sha256 hash to verify the downloaded file.
22
+ sha256: 'a98d3857b95b96c156d954780d28f39dcb35b642e72892ee08ddff70719e6220',
23
+ // The preparation process downloads and verifies model files before instantiating the model.
24
+ // Use this to control when that happens. Options are:
25
+ // - 'on-demand' = prepare on first request. This is the default.
26
+ // - 'blocking' = prepare immediately on startup
27
+ // - 'async' = prepare in background but don't block startup. Requests to the model during the preparation process will resolve once it's ready.
28
+ // Note that if minInstances > 0 then this is effectively always "blocking" because the model preparation will happen immediately.
29
+ prepare: 'on-demand',
30
+ // What should be preloaded in context, for text completion / chat models.
31
+ preload: {
32
+ // Note that for preloading to be utilized, requests must
33
+ // also have these leading messages before the user message.
34
+ messages: [
35
+ {
36
+ role: 'system',
37
+ content: 'You are a helpful assistant.',
38
+ },
39
+ ],
40
+ // toolDocumentation: true, // Tool docs may also be preloaded. See `tools` below.
41
+ },
42
+ // Options to control resource usage.
43
+ contextSize: 2046, // Maximum context size. Will be determined automatically if not set.
44
+ maxInstances: 2, // How many active sessions you wanna be able to cache at the same time.
45
+ minInstances: 1, // To always keep at least one instance ready. Defaults to 0.
46
+ // Idle instances will be disposed after this many seconds.
47
+ ttl: 300, // Defaults to 5min. Set it to zero to immediately dispose of instances after use.
48
+ // Set defaults for completions. These can be overridden per request.
49
+ // If unset, default values depend on the engine.
50
+ completionDefaults: {
51
+ temperature: 1,
52
+ },
53
+ // Configure hardware / device to use.
54
+ device: {
55
+ // GPU will be used automatically if left unset.
56
+ // Only one model can use the gpu at a time.
57
+ // gpu: true, // Force gpu use for instance of this model. (This effectively limits maxInstance to 1.)
58
+ // cpuThreads: 4, // Only gpt4all and node-llama-cpp
59
+ // memLock: true, // Only node-llama-cpp.
60
+ },
61
+ // node-llama-cpp text-completion models may have GBNF grammars and tools configured.
62
+ // You can define multiple grammars for a model. `json` grammar will always be available.
63
+ // Key is the grammar name (that later can be used as value for `grammar` in a request). Value is a string containing the GBNF grammar.
64
+ grammars: {
65
+ // For example:
66
+ // 'custom-grammar': fs.readFileSync('custom-grammar.gbnf', 'utf8'), // Supply your own grammar
67
+ // 'chess': await LlamaGrammar.getFor(llama, 'chess') // Or reuse a grammar shipped with (node-)llama-cpp
68
+ },
69
+ // Available tools may be defined on the model or during requests.
70
+ // Note that for using `preload` with `toolDocumentation` they _must_ be defined here (on the model).
71
+ tools: {
72
+ includeParamsDocumentation: true, // Include parameter documentation in tool documentation.
73
+ parallelism: 2, // How many tools may be executed in parallel. Defaults to 1.
74
+ definitions: {
75
+ getLocationWeather: {
76
+ description: 'Get the weather in a location',
77
+ parameters: {
78
+ type: 'object',
79
+ properties: {
80
+ location: {
81
+ type: 'string',
82
+ description: 'The city and state, e.g. San Francisco, CA',
83
+ },
84
+ unit: {
85
+ type: 'string',
86
+ enum: ['celsius', 'fahrenheit'],
87
+ },
88
+ },
89
+ required: ['location'],
90
+ },
91
+ // Handler is optional. If it's set, the model will ingest the return value and respond with the final assistant message.
92
+ // If unset the model will respond with a tool call message instead. In this case you need to push tool call results into the message array.
93
+ handler: async (parameters) => {
94
+ const { location, unit } = parameters
95
+ // Call a weather API or something
96
+ return `The temperature in ${location} is 23°C`
97
+ },
98
+ }
99
+ }
100
+ }
101
+ },
102
+ },
103
+ // HTTP listen options. If you don't need a web server, use `startModelServer` or `new ModelServer()`.
104
+ // Accepted arguments are identical, apart from `listen`.
105
+ listen: {
106
+ port: 3000,
107
+ },
108
+ })
@@ -0,0 +1,56 @@
1
+ import readline from 'node:readline'
2
+ import chalk from 'chalk'
3
+ import { ModelServer } from '#package/index.js'
4
+
5
+ // A command-line chat example using the ModelServer.
6
+
7
+ const modelServer = new ModelServer({
8
+ // log: 'info',
9
+ models: {
10
+ 'my-model': {
11
+ task: 'text-completion',
12
+ minInstances: 1,
13
+ url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
14
+ sha256: 'a98d3857b95b96c156d954780d28f39dcb35b642e72892ee08ddff70719e6220',
15
+ engine: 'node-llama-cpp',
16
+ // device: { gpu: false },
17
+ },
18
+ },
19
+ })
20
+
21
+ console.log('Initializing models...')
22
+
23
+ await modelServer.start()
24
+
25
+ const rl = readline.createInterface({
26
+ input: process.stdin,
27
+ output: process.stdout,
28
+ })
29
+
30
+ const messages = []
31
+
32
+ while (true) {
33
+ const input = await new Promise((resolve) => {
34
+ rl.question(chalk.bold(chalk.dim('user > ')), (input) => {
35
+ resolve(input)
36
+ })
37
+ })
38
+ messages.push({
39
+ role: 'user',
40
+ content: input,
41
+ })
42
+ process.stdout.write(chalk.bold(chalk.dim('model > ')))
43
+ const result = await modelServer.processChatCompletionTask(
44
+ {
45
+ model: 'my-model',
46
+ messages,
47
+ },
48
+ {
49
+ onChunk: (chunk) => {
50
+ process.stdout.write(chunk.text)
51
+ },
52
+ },
53
+ )
54
+ messages.push(result.message)
55
+ process.stdout.write(' ' + chalk.dim(`[${result.finishReason}]`) + '\n')
56
+ }
@@ -0,0 +1,65 @@
1
+ import http from 'node:http'
2
+ import { startModelServer } from '#package/index.js'
3
+
4
+ // A minimal chat server using the ModelServer.
5
+
6
+ const modelServer = await startModelServer({
7
+ log: 'info',
8
+ concurrency: 2,
9
+ models: {
10
+ 'phi3-mini-4k': {
11
+ task: 'text-completion',
12
+ url: 'https://gpt4all.io/models/gguf/Phi-3-mini-4k-instruct.Q4_0.gguf',
13
+ engine: 'gpt4all',
14
+ maxInstances: 2,
15
+ },
16
+ },
17
+ })
18
+
19
+ const httpServer = http.createServer((req, res) => {
20
+ if (req.url === '/chat' && req.method === 'POST') {
21
+ let body = ''
22
+ req.on('data', (chunk) => {
23
+ body += chunk.toString()
24
+ })
25
+ req.on('end', async () => {
26
+ const req = JSON.parse(body)
27
+ const completion = await modelServer.processChatCompletionTask(req)
28
+ res.writeHead(200, { 'Content-Type': 'application/json' })
29
+ res.end(JSON.stringify(completion, null, 2))
30
+ })
31
+ } else {
32
+ res.writeHead(404, { 'Content-Type': 'text/plain' })
33
+ res.end('Not found')
34
+ }
35
+ })
36
+ httpServer.listen(3000).on('listening', () => {
37
+ console.log('HTTP Server up')
38
+ })
39
+
40
+ /*
41
+ curl http://localhost:3000/chat \
42
+ -H "Content-Type: application/json" \
43
+ -d '{
44
+ "model": "phi3-mini-4k",
45
+ "messages": [
46
+ {
47
+ "role": "user",
48
+ "content": "how to find my kernel version on linux=?"
49
+ }
50
+ ]
51
+ }'
52
+ */
53
+
54
+ /*
55
+ {
56
+ "finishReason": "eogToken",
57
+ "message": {
58
+ "role": "assistant",
59
+ "content": "To find your kernel version on Linux, you can use the following methods: [...]"
60
+ },
61
+ "promptTokens": 10,
62
+ "completionTokens": 344,
63
+ "totalTokens": 354
64
+ }
65
+ */
@@ -0,0 +1,70 @@
1
+ import { startHTTPServer } from '#package/http.js'
2
+ import OpenAI from 'openai'
3
+ import readline from 'node:readline'
4
+
5
+ // Printing two parallel completion processes to the console.
6
+
7
+ const httpServer = await startHTTPServer({
8
+ listen: { port: 3000 },
9
+ concurrency: 2, // two clients may process chat completions at the same time.
10
+ models: {
11
+ 'my-model': {
12
+ task: 'text-completion',
13
+ engine: 'node-llama-cpp',
14
+ url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
15
+ sha256: 'a98d3857b95b96c156d954780d28f39dcb35b642e72892ee08ddff70719e6220',
16
+ minInstances: 1, // one instance / session will always be ready
17
+ maxInstances: 2, // up to two may be spawned
18
+ device: { gpu: false, cpuThreads: 4 }, // configure so they're roughly the same speed
19
+ },
20
+ },
21
+ })
22
+ const openai = new OpenAI({
23
+ baseURL: 'http://localhost:3000/openai/v1/',
24
+ apiKey: 'yes',
25
+ })
26
+ let sentence1 = 'Sometimes I feel like'
27
+ let sentence2 = 'The locality of'
28
+ const clearLine = () => {
29
+ readline.cursorTo(process.stdout, 0)
30
+ readline.clearLine(process.stdout, 0)
31
+ }
32
+ const updateOutputs = () => {
33
+ const truncateLine = (line) => {
34
+ return line.length > process.stdout.columns
35
+ ? '...' + line.slice(line.length - process.stdout.columns + 3)
36
+ : line
37
+ }
38
+ readline.moveCursor(process.stdout, 0, -2)
39
+ clearLine()
40
+ process.stdout.write(truncateLine(sentence1) + '\n')
41
+ clearLine()
42
+ process.stdout.write(truncateLine(sentence2) + '\n')
43
+ }
44
+ const completeSentence = async (prompt, onTokens) => {
45
+ const completion = await openai.completions.create({
46
+ stream_options: { include_usage: true },
47
+ model: 'my-model',
48
+ stream: true,
49
+ temperature: 1,
50
+ stop: ['.'],
51
+ prompt,
52
+ })
53
+ for await (const chunk of completion) {
54
+ if (chunk.choices[0].text) {
55
+ onTokens(chunk.choices[0].text.replaceAll('\n', '\\n'))
56
+ }
57
+ }
58
+ onTokens('.')
59
+ }
60
+ setInterval(updateOutputs, 200)
61
+ console.log(sentence1)
62
+ console.log(sentence2)
63
+ while (true) {
64
+ await Promise.all([
65
+ completeSentence(sentence1, (text) => (sentence1 += text)),
66
+ completeSentence(sentence2, (text) => (sentence2 += text)),
67
+ ])
68
+ }
69
+ httpServer.close()
70
+ clearInterval(updateOutputs)
@@ -0,0 +1,70 @@
1
+ import http from 'node:http'
2
+ import express from 'express'
3
+ import OpenAI from 'openai'
4
+ import { ModelServer } from '#package/server.js'
5
+ import { createExpressMiddleware } from '#package/http.js'
6
+
7
// Demonstration of using the ModelServer + Express middleware to serve an OpenAI API.

// One model, at most 2 instances processing requests at a time.
// Weights are downloaded on-demand, or during ModelServer.start() when minInstances > 0.
const modelConfig = {
  task: 'text-completion',
  url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
  sha256: 'a98d3857b95b96c156d954780d28f39dcb35b642e72892ee08ddff70719e6220',
  engine: 'node-llama-cpp',
  maxInstances: 2,
}
const modelServer = new ModelServer({
  concurrency: 2,
  models: { 'my-model': modelConfig },
})
await modelServer.start()

// Mount the middleware on a plain express app and start listening.
const app = express()
app.use(express.json(), createExpressMiddleware(modelServer))
const server = http.createServer(app)
server.listen(3001)

console.log('Server up, sending chat completion request...')

// Talk to our own server through the official OpenAI client.
const openai = new OpenAI({
  baseURL: 'http://localhost:3001/openai/v1/',
  apiKey: '123',
})

const completion = await openai.chat.completions.create({
  model: 'my-model',
  messages: [{ role: 'user', content: 'Lets count to three!' }],
  stop: ['Two'],
})

console.log(JSON.stringify(completion, null, 2))

/*
{
  "id": "my-model:pU2BHWUv-kHdAeVn8",
  "model": "my-model",
  "object": "chat.completion",
  "created": 1714431837,
  "system_fingerprint": "0159c68a067a360e4be3e285d3e309440c070734",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Sure, let's count together: 1 (one), 2 (two), and 3 (three). If you have any other questions or need further assistance, feel free to ask!"
      },
      "logprobs": null,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 6,
    "completion_tokens": 41,
    "total_tokens": 47
  }
}
*/
@@ -0,0 +1,91 @@
1
+ import os from 'node:os'
2
+ import path from 'node:path'
3
+ import chalk from 'chalk'
4
+ import { ModelPool } from '#package/index.js'
5
+ import { elapsedMillis } from '#package/lib/util.js'
6
+ import * as LlamaCppEngine from '#package/engines/node-llama-cpp/engine.js'
7
+
8
+ // Complete multiple prompts concurrently using ModelPool.
9
+
10
// Hook invoked before a fresh instance loads its model. The pool waits for
// the returned promise before loading; rejecting here puts the instance in
// an error state. Currently a no-op placeholder.
async function onPrepareInstance(instance) {
  // e.g. warm caches or inspect `instance` here before the model loads.
}
16
+
17
// Absolute path to a model file that must already be downloaded.
const modelPath = path.resolve(
  os.homedir(),
  '.cache/inference-server/huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF-main/smollm-135m-instruct-add-basics-q8_0.gguf',
)

const pool = new ModelPool(
  {
    // set log: 'debug' to see what's going on under the hood
    // global processing concurrency limit, across all instances of all models
    concurrency: 2,
    models: {
      'my-model': {
        task: 'text-completion',
        location: modelPath,
        engine: 'node-llama-cpp',
        minInstances: 1, // > 0 loads the model eagerly during pool.init()
        maxInstances: 2, // the pool may spawn up to this many instances
      },
    },
  },
  onPrepareInstance,
)
39
+
40
// Make sure pool instances are torn down when the process exits.
process.on('exit', () => pool.dispose())

console.log('Initializing pool...')
// Register the engine implementation and spin up minInstances.
await pool.init({ 'node-llama-cpp': LlamaCppEngine })
48
+
49
// Request an instance from the pool, run one text completion on it, and
// return the generated text plus instance/throughput metadata.
async function createCompletion(prompt) {
  const req = {
    model: 'my-model',
    prompt,
    temperature: 3,
    maxTokens: 200,
  }
  const completionModel = await pool.requestInstance(req)
  try {
    const completionBegin = process.hrtime.bigint()
    const task = completionModel.instance.processTextCompletionTask(req)
    const result = await task.result
    // Clamp to >= 1s so very short runs don't report inflated token rates.
    const elapsed = Math.max(elapsedMillis(completionBegin), 1000)
    return {
      text: result.text,
      instance: completionModel.instance.id,
      device: completionModel.instance.gpu ? 'GPU' : 'CPU',
      speed: Math.round(result.completionTokens / (elapsed / 1000)),
    }
  } finally {
    // Bug fix: always return the instance to the pool, even when the task
    // rejects — previously a failed completion leaked the instance slot.
    completionModel.release()
  }
}
69
+
70
// Curried logger: printResult(title) returns a callback that prints one
// completion result, colorized with chalk. NOTE(review): reads the
// module-level `prompt` binding for the dimmed prefix.
const printResult = (title) => (result) => {
  const header = [
    chalk.yellow(title),
    chalk.bold(result.instance),
    chalk.dim(`generated ${result.speed} tokens/s on ${result.device}`),
  ].join(' ')
  console.log(header)
  console.log(chalk.dim(prompt) + result.text)
}
80
+
81
const completionCount = 20
const prompt = 'Locality of '

// Run one completion up front so the first result reflects model load time.
const res = await createCompletion(prompt)
printResult('Initial completion')(res)

console.log(`Processing ${completionCount} completions...`)

// Fire all completions concurrently; the pool throttles actual execution.
// Bug fix: each fire-and-forget promise now gets a rejection handler, so a
// failed completion is logged instead of crashing the process with an
// unhandled promise rejection.
for (let i = 1; i <= completionCount; i++) {
  createCompletion(prompt)
    .then(printResult(`#${i}`))
    .catch((err) => console.error(`#${i} failed:`, err))
}