genai-lite 0.3.3 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +421 -51
- package/dist/index.d.ts +5 -0
- package/dist/index.js +8 -1
- package/dist/llm/LLMService.d.ts +29 -2
- package/dist/llm/LLMService.js +67 -36
- package/dist/llm/clients/LlamaCppClientAdapter.d.ts +116 -0
- package/dist/llm/clients/LlamaCppClientAdapter.js +289 -0
- package/dist/llm/clients/LlamaCppServerClient.d.ts +161 -0
- package/dist/llm/clients/LlamaCppServerClient.js +192 -0
- package/dist/llm/config.d.ts +12 -0
- package/dist/llm/config.js +81 -4
- package/dist/llm/services/ModelResolver.js +13 -13
- package/dist/llm/services/SettingsManager.js +17 -11
- package/dist/llm/types.d.ts +87 -22
- package/dist/prompting/parser.d.ts +2 -2
- package/dist/prompting/parser.js +2 -2
- package/dist/providers/fromEnvironment.d.ts +4 -0
- package/dist/providers/fromEnvironment.js +8 -0
- package/package.json +1 -1
- package/dist/llm/LLMService.createMessages.test.d.ts +0 -4
- package/dist/llm/LLMService.createMessages.test.js +0 -364
- package/dist/llm/LLMService.original.d.ts +0 -147
- package/dist/llm/LLMService.original.js +0 -656
- package/dist/llm/LLMService.prepareMessage.test.d.ts +0 -1
- package/dist/llm/LLMService.prepareMessage.test.js +0 -303
- package/dist/llm/LLMService.presets.test.d.ts +0 -1
- package/dist/llm/LLMService.presets.test.js +0 -210
- package/dist/llm/LLMService.sendMessage.preset.test.d.ts +0 -1
- package/dist/llm/LLMService.sendMessage.preset.test.js +0 -153
- package/dist/llm/LLMService.test.d.ts +0 -1
- package/dist/llm/LLMService.test.js +0 -620
- package/dist/llm/clients/AnthropicClientAdapter.test.d.ts +0 -1
- package/dist/llm/clients/AnthropicClientAdapter.test.js +0 -273
- package/dist/llm/clients/GeminiClientAdapter.test.d.ts +0 -1
- package/dist/llm/clients/GeminiClientAdapter.test.js +0 -405
- package/dist/llm/clients/MockClientAdapter.test.d.ts +0 -1
- package/dist/llm/clients/MockClientAdapter.test.js +0 -250
- package/dist/llm/clients/OpenAIClientAdapter.test.d.ts +0 -1
- package/dist/llm/clients/OpenAIClientAdapter.test.js +0 -258
- package/dist/llm/clients/adapterErrorUtils.test.d.ts +0 -1
- package/dist/llm/clients/adapterErrorUtils.test.js +0 -123
- package/dist/llm/config.test.d.ts +0 -1
- package/dist/llm/config.test.js +0 -176
- package/dist/llm/services/AdapterRegistry.test.d.ts +0 -1
- package/dist/llm/services/AdapterRegistry.test.js +0 -239
- package/dist/llm/services/ModelResolver.test.d.ts +0 -1
- package/dist/llm/services/ModelResolver.test.js +0 -158
- package/dist/llm/services/PresetManager.test.d.ts +0 -1
- package/dist/llm/services/PresetManager.test.js +0 -210
- package/dist/llm/services/RequestValidator.test.d.ts +0 -1
- package/dist/llm/services/RequestValidator.test.js +0 -159
- package/dist/llm/services/SettingsManager.test.d.ts +0 -1
- package/dist/llm/services/SettingsManager.test.js +0 -266
- package/dist/prompting/builder.d.ts +0 -38
- package/dist/prompting/builder.js +0 -63
- package/dist/prompting/builder.test.d.ts +0 -4
- package/dist/prompting/builder.test.js +0 -109
- package/dist/prompting/content.test.d.ts +0 -4
- package/dist/prompting/content.test.js +0 -212
- package/dist/prompting/parser.test.d.ts +0 -4
- package/dist/prompting/parser.test.js +0 -464
- package/dist/prompting/template.test.d.ts +0 -1
- package/dist/prompting/template.test.js +0 -250
- package/dist/providers/fromEnvironment.test.d.ts +0 -1
- package/dist/providers/fromEnvironment.test.js +0 -46
package/README.md
CHANGED
````diff
@@ -1,10 +1,11 @@
 # genai-lite
 
-A lightweight, portable Node.js/TypeScript library providing a unified interface for interacting with multiple Generative AI providers (OpenAI, Anthropic, Google Gemini, Mistral
+A lightweight, portable Node.js/TypeScript library providing a unified interface for interacting with multiple Generative AI providers - both cloud-based (OpenAI, Anthropic, Google Gemini, Mistral) and local (llama.cpp).
 
 ## Features
 
 - **Unified API** - Single interface for multiple AI providers
+- **Local & Cloud Models** - Run models locally with llama.cpp or use cloud APIs
 - **Flexible API Key Management** - Bring your own key storage solution
 - **Zero Electron Dependencies** - Works in any Node.js environment
 - **TypeScript First** - Full type safety and IntelliSense support
@@ -21,13 +22,14 @@ npm install genai-lite
 
 ## Quick Start
 
+### Cloud Providers (OpenAI, Anthropic, Gemini, Mistral)
+
 ```typescript
 import { LLMService, fromEnvironment } from 'genai-lite';
 
 // Create service with environment variable API key provider
 const llmService = new LLMService(fromEnvironment);
 
-// Option 1: Direct message sending
 const response = await llmService.sendMessage({
   providerId: 'openai',
   modelId: 'gpt-4.1-mini',
@@ -37,26 +39,47 @@ const response = await llmService.sendMessage({
   ]
 });
 
-
-
-
-
-
-
+if (response.object === 'chat.completion') {
+  console.log(response.choices[0].message.content);
+} else {
+  console.error('Error:', response.error.message);
+}
+```
 
-
-
-
-
+### Local Models (llama.cpp)
+
+```typescript
+import { LLMService } from 'genai-lite';
+
+// Start llama.cpp server first: llama-server -m /path/to/model.gguf --port 8080
+const llmService = new LLMService(async () => 'not-needed');
+
+const response = await llmService.sendMessage({
+  providerId: 'llamacpp',
+  modelId: 'llama-3-8b-instruct', // Must match your loaded model
+  messages: [
+    { role: 'system', content: 'You are a helpful assistant.' },
+    { role: 'user', content: 'Explain quantum computing briefly.' }
+  ]
 });
 
 if (response.object === 'chat.completion') {
   console.log(response.choices[0].message.content);
-} else {
-  console.error('Error:', response.error.message);
 }
 ```
 
+See the [llama.cpp Integration](#llamacpp-integration) section for setup details.
+
+## Example Application
+
+For a complete, production-ready example showcasing all genai-lite capabilities, see the **[chat-demo](examples/chat-demo)** interactive web application. The demo includes:
+- Multi-provider chat interface with all supported providers
+- Template rendering and model presets
+- llama.cpp utilities (tokenization, embeddings, health checks)
+- Settings persistence, export/import features
+
+The chat-demo serves as both a comprehensive showcase and a quick-test environment for library changes.
+
 ## API Key Management
 
 genai-lite uses a flexible API key provider pattern. You can use the built-in environment variable provider or create your own:
````
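Both Quick Start snippets added above use the same request and response shape; only the provider ID, model ID, and key provider change. A minimal sketch of that symmetry (the `ask` helper is illustrative, not part of the package):

```typescript
import { LLMService, fromEnvironment } from 'genai-lite';

// Illustrative helper: one call path for cloud and local backends.
async function ask(providerId: 'openai' | 'llamacpp', modelId: string, prompt: string): Promise<string> {
  // Cloud providers read keys from the environment; llama.cpp accepts any placeholder key.
  const service = new LLMService(providerId === 'llamacpp' ? async () => 'not-needed' : fromEnvironment);

  const response = await service.sendMessage({
    providerId,
    modelId,
    messages: [{ role: 'user', content: prompt }]
  });

  if (response.object === 'chat.completion') {
    return response.choices[0].message.content;
  }
  throw new Error(response.error.message);
}

// await ask('openai', 'gpt-4.1-mini', 'Hello!');
// await ask('llamacpp', 'llama-3-8b-instruct', 'Hello!');
```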
````diff
@@ -124,6 +147,64 @@ const llmService = new LLMService(myKeyProvider);
 - `codestral-2501` - Specialized for code generation
 - `devstral-small-2505` - Compact development-focused model
 
+### llama.cpp (Local Models)
+
+Run models locally via [llama.cpp](https://github.com/ggml-org/llama.cpp) server. Model IDs can be any name - they're not validated since you load your own GGUF models.
+
+**Example models:**
+- `llama-3-8b-instruct` - Llama 3 8B Instruct
+- `llama-3-70b-instruct` - Llama 3 70B Instruct
+- `mistral-7b-instruct` - Mistral 7B Instruct
+- `my-custom-model` - Any custom model you've loaded
+
+**Setup:**
+
+1. Start llama.cpp server with your model:
+```bash
+llama-server -m /path/to/model.gguf --port 8080
+```
+
+2. Use with genai-lite (no API key needed):
+```typescript
+import { LLMService } from 'genai-lite';
+
+// API key can be any string for llama.cpp
+const service = new LLMService(async () => 'not-needed');
+
+const response = await service.sendMessage({
+  providerId: 'llamacpp',
+  modelId: 'llama-3-8b-instruct', // Must match your loaded model name
+  messages: [{ role: 'user', content: 'Hello!' }]
+});
+```
+
+3. Configure server URL via environment variable:
+```bash
+export LLAMACPP_API_BASE_URL=http://localhost:8080
+```
+
+**Advanced features** - Access non-LLM endpoints:
+
+```typescript
+import { LlamaCppServerClient } from 'genai-lite';
+
+const client = new LlamaCppServerClient('http://localhost:8080');
+
+// Check server health
+const health = await client.getHealth();
+
+// Tokenize text
+const { tokens } = await client.tokenize('Hello world');
+
+// Generate embeddings
+const { embedding } = await client.createEmbedding('Some text');
+
+// Code completion
+const result = await client.infill('def hello():\n', '\nprint("done")');
+```
+
+See the [llama.cpp Integration](#llamacpp-integration) section for details.
+
 ### Models with Reasoning Support
 
 Some models include advanced reasoning/thinking capabilities that enhance their problem-solving abilities:
````
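The setup steps above leave one practical gap: the server may still be loading the model when the first request arrives. A minimal sketch that waits for `getHealth()` to report `'ok'` before sending (the `waitUntilReady` helper and its retry timing are assumptions, not part of the package):

```typescript
import { LLMService, LlamaCppServerClient } from 'genai-lite';

const baseURL = process.env.LLAMACPP_API_BASE_URL ?? 'http://localhost:8080';
const client = new LlamaCppServerClient(baseURL);

// Illustrative helper: poll the health endpoint until the model has finished loading.
async function waitUntilReady(attempts = 10): Promise<void> {
  for (let i = 0; i < attempts; i++) {
    try {
      const health = await client.getHealth();
      if (health.status === 'ok') return; // 'loading' means the model is still being read
    } catch {
      // Server not reachable yet; retry below.
    }
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
  throw new Error(`llama.cpp server at ${baseURL} did not become ready`);
}

await waitUntilReady();

const service = new LLMService(async () => 'not-needed');
const response = await service.sendMessage({
  providerId: 'llamacpp',
  modelId: 'llama-3-8b-instruct',
  messages: [{ role: 'user', content: 'Hello!' }]
});
```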
````diff
@@ -214,9 +295,11 @@ if (response.object === 'chat.completion' && response.choices[0].reasoning) {
 - Not all models support reasoning - check the [supported models](#models-with-reasoning-support) list
 - The `reasoning` field in the response contains the model's thought process (when available)
 
-###
+### Thinking Extraction and Enforcement
+
+For models without native reasoning, you can prompt them to output reasoning in XML tags like `<thinking>`. The library then extracts these tags and moves the content to the standardized `reasoning` field, providing a consistent interface across all models.
 
-
+**Key point:** The library doesn't make models think automatically - you must explicitly instruct non-reasoning models to use thinking tags in your prompt. The library then enforces that these tags are present (for non-reasoning models) or accepts native reasoning (for reasoning models).
 
 ```typescript
 // Prompt the model to think step-by-step in a <thinking> tag
@@ -231,7 +314,7 @@ const response = await llmService.sendMessage({
     content: 'Please think through this problem step by step before answering: What is 15% of 240?'
   }],
   settings: {
-
+    thinkingTagFallback: { enabled: true } // Must explicitly enable
   }
 });
 
@@ -253,25 +336,25 @@ const response = await llmService.sendMessage({
   modelId: 'claude-3-5-haiku-20241022',
   messages: [{ role: 'user', content: 'Solve this step by step...' }],
   settings: {
-
+    thinkingTagFallback: {
       enabled: true, // Must explicitly enable (default: false)
-
-
+      tagName: 'scratchpad', // Custom tag name (default: 'thinking')
+      enforce: true // Smart enforcement (see below)
     }
   }
 });
 ```
 
-**The `
+**The `enforce` Property:**
 
-The `
+The `enforce` boolean controls whether thinking tags are required when native reasoning is not active:
 
-- `
-- `
-- `'error'`: Return an error response with the original response preserved in `partialResponse`
-- `'auto'` (default): Intelligently decide based on the model's native reasoning capabilities
+- `enforce: true` - Error if tags missing AND native reasoning not active (smart enforcement)
+- `enforce: false` (default) - Extract tags if present, never error
 
-**
+The enforcement is **always smart** - it automatically checks if native reasoning is active and only enforces when the model needs tags as a fallback.
+
+**How Smart Enforcement Works:**
 
 ```typescript
 // With non-native reasoning models (e.g., GPT-4)
@@ -286,10 +369,10 @@ const response = await llmService.sendMessage({
     content: 'What is 15% of 240?'
   }],
   settings: {
-
+    thinkingTagFallback: { enabled: true, enforce: true }
   }
 });
-// Result: ERROR if <thinking> tag is missing (
+// Result: ERROR if <thinking> tag is missing (native reasoning not active)
 // The response is still accessible via errorResponse.partialResponse
 
 // With native reasoning models (e.g., Claude with reasoning enabled)
@@ -299,10 +382,10 @@ const response = await llmService.sendMessage({
   messages: [/* same prompt */],
   settings: {
     reasoning: { enabled: true },
-
+    thinkingTagFallback: { enabled: true, enforce: true }
   }
 });
-// Result: SUCCESS even if <thinking> tag is missing (
+// Result: SUCCESS even if <thinking> tag is missing (native reasoning is active)
 ```
 
 This intelligent enforcement ensures that:
````
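A short sketch of the failure path described in the hunks above, i.e. what a caller sees when `enforce: true` rejects a response without thinking tags. The exact placement of the preserved output is inferred from the `errorResponse.partialResponse` reference in the README, so treat that property access as an assumption:

```typescript
import { LLMService, fromEnvironment } from 'genai-lite';

const llmService = new LLMService(fromEnvironment);

const response = await llmService.sendMessage({
  providerId: 'openai',
  modelId: 'gpt-4.1',
  messages: [{
    role: 'user',
    content: 'What is 15% of 240? Write your reasoning in <thinking> tags first.'
  }],
  settings: {
    thinkingTagFallback: { enabled: true, enforce: true }
  }
});

if (response.object === 'chat.completion') {
  // Tags were found (or native reasoning was active): reasoning is surfaced separately.
  console.log('Reasoning:', response.choices[0].reasoning);
  console.log('Answer:', response.choices[0].message.content);
} else {
  // Enforcement failed: the model skipped the <thinking> tags.
  console.error('Enforcement error:', response.error.message);
  // Assumption: the original completion is preserved on the failure response.
  console.log('Raw output:', (response as { partialResponse?: unknown }).partialResponse);
}
```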
````diff
@@ -429,13 +512,10 @@ The library provides a powerful `createMessages` method that combines template r
 // Basic example: Create model-aware messages
 const { messages, modelContext } = await llmService.createMessages({
   template: `
-<SYSTEM>
-You are a {{ thinking_enabled ? "thoughtful" : "helpful" }} assistant.
-{{ thinking_available && !thinking_enabled ? "Note: Reasoning mode is available for complex problems." : "" }}
-</SYSTEM>
+<SYSTEM>You are a helpful assistant.</SYSTEM>
 <USER>{{ question }}</USER>
   `,
-  variables: {
+  variables: {
     question: 'What is the optimal algorithm for finding the shortest path in a weighted graph?'
   },
   presetId: 'anthropic-claude-3-7-sonnet-20250219-thinking'
@@ -479,14 +559,26 @@ The method provides:
 - **Template Rendering**: Full support for conditionals and variable substitution
 - **Role Tag Parsing**: Converts `<SYSTEM>`, `<USER>`, and `<ASSISTANT>` tags to messages
 
-Available model context variables
-
-- `
+**Available model context variables:**
+
+- `native_reasoning_active`: Whether native reasoning is **currently active** for this request
+  - `true`: The model is using built-in reasoning (e.g., Claude 4, o4-mini, Gemini 2.5 Pro with reasoning enabled)
+  - `false`: No native reasoning is active (either because the model doesn't support it, or it's been disabled)
+- `native_reasoning_capable`: Whether the model **has the capability** to use native reasoning
+  - `true`: Model supports native reasoning (may or may not be enabled)
+  - `false`: Model does not support native reasoning
 - `model_id`: The resolved model ID
 - `provider_id`: The resolved provider ID
 - `reasoning_effort`: The reasoning effort level if specified
 - `reasoning_max_tokens`: The reasoning token budget if specified
 
+**Best Practice for Templates:**
+When adding thinking tag instructions to your templates, **always use `requires_tags_for_thinking`** (the NOT operator). This ensures:
+- Models with active native reasoning get clean, direct prompts
+- Models without native reasoning get explicit instructions to use `<thinking>` tags
+
+Example: `{{ requires_tags_for_thinking ? ' Write your reasoning in <thinking> tags first.' : '' }}`
+
 #### Advanced Features
 
 **Dynamic Role Injection:**
@@ -536,7 +628,7 @@ const response = await llmService.sendMessage({
   modelId: 'gpt-4.1',
   messages,
   settings: {
-
+    thinkingTagFallback: { enabled: true } // Default, but shown for clarity
   }
 });
 
@@ -559,7 +651,7 @@ const creativeWritingTemplate = `
   "settings": {
     "temperature": 0.9,
     "maxTokens": 3000,
-    "
+    "thinkingTagFallback": { "enabled": true, "tagName": "reasoning" }
   }
 }
 </META>
````
````diff
@@ -666,6 +758,261 @@ if (response.object === 'error') {
 }
 ```
 
+## llama.cpp Integration
+
+`genai-lite` provides comprehensive support for running local LLMs via [llama.cpp](https://github.com/ggml-org/llama.cpp) server, enabling completely offline AI capabilities with the same unified interface.
+
+### Why llama.cpp?
+
+- **Privacy**: All model inference runs locally on your hardware
+- **Cost**: No API costs after initial model download
+- **Control**: Use any GGUF model from Hugging Face
+- **Performance**: Optimized C++ implementation with hardware acceleration
+
+### Setup
+
+#### 1. Install llama.cpp
+
+```bash
+# Clone and build llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+make
+
+# Or download pre-built binaries from releases
+```
+
+#### 2. Download a Model
+
+Get GGUF models from Hugging Face, for example:
+- [Meta-Llama-3.1-8B-Instruct-GGUF](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
+- [Mistral-7B-Instruct-v0.3-GGUF](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
+
+#### 3. Start the Server
+
+```bash
+# Basic usage
+llama-server -m /path/to/model.gguf --port 8080
+
+# With more options
+llama-server -m /path/to/model.gguf \
+  --port 8080 \
+  -c 4096 \ # Context size
+  -np 4 \ # Parallel requests
+  --threads 8 # CPU threads
+```
+
+### Basic Usage
+
+```typescript
+import { LLMService } from 'genai-lite';
+
+// llama.cpp doesn't need API keys
+const service = new LLMService(async () => 'not-needed');
+
+const response = await service.sendMessage({
+  providerId: 'llamacpp',
+  modelId: 'llama-3-8b-instruct', // Arbitrary name matching your model
+  messages: [
+    { role: 'system', content: 'You are a helpful assistant.' },
+    { role: 'user', content: 'Explain quantum computing in simple terms.' }
+  ],
+  settings: {
+    temperature: 0.7,
+    maxTokens: 500
+  }
+});
+
+if (response.object === 'chat.completion') {
+  console.log(response.choices[0].message.content);
+}
+```
+
+### Configuration
+
+#### Environment Variable
+
+Set the server URL via environment variable (default: `http://localhost:8080`):
+
+```bash
+export LLAMACPP_API_BASE_URL=http://localhost:8080
+```
+
+#### Multiple Servers
+
+Register multiple llama.cpp instances for different models:
+
+```typescript
+import { LLMService, LlamaCppClientAdapter } from 'genai-lite';
+
+const service = new LLMService(async () => 'not-needed');
+
+// Register adapters for different servers/models
+service.registerAdapter(
+  'llamacpp-small',
+  new LlamaCppClientAdapter({ baseURL: 'http://localhost:8080' })
+);
+
+service.registerAdapter(
+  'llamacpp-large',
+  new LlamaCppClientAdapter({ baseURL: 'http://localhost:8081' })
+);
+
+// Use them
+const response = await service.sendMessage({
+  providerId: 'llamacpp-small',
+  modelId: 'llama-3-8b',
+  messages: [{ role: 'user', content: 'Hello!' }]
+});
+```
+
+#### Health Checking
+
+Enable automatic health checks before requests:
+
+```typescript
+import { LlamaCppClientAdapter } from 'genai-lite';
+
+const adapter = new LlamaCppClientAdapter({
+  baseURL: 'http://localhost:8080',
+  checkHealth: true // Check server status before each request
+});
+
+service.registerAdapter('llamacpp', adapter);
+```
+
+### Advanced Features
+
+#### Server Management
+
+The `LlamaCppServerClient` class provides access to all llama.cpp server endpoints:
+
+```typescript
+import { LlamaCppServerClient } from 'genai-lite';
+
+const client = new LlamaCppServerClient('http://localhost:8080');
+
+// Health monitoring
+const health = await client.getHealth();
+console.log(health.status); // 'ok', 'loading', or 'error'
+
+// Server properties
+const props = await client.getProps();
+console.log(props.total_slots); // Number of available slots
+
+// Performance metrics (if enabled)
+const metrics = await client.getMetrics();
+```
+
+#### Tokenization
+
+```typescript
+const client = new LlamaCppServerClient('http://localhost:8080');
+
+// Tokenize text
+const { tokens } = await client.tokenize('Hello, world!');
+console.log(tokens); // [123, 456, 789]
+
+// Count tokens before sending to LLM
+const prompt = 'Long text...';
+const { tokens: promptTokens } = await client.tokenize(prompt);
+if (promptTokens.length > 4000) {
+  console.log('Prompt too long, truncating...');
+}
+
+// Detokenize back to text
+const { content } = await client.detokenize([123, 456, 789]);
+console.log(content); // 'Hello, world!'
+```
+
+#### Text Embeddings
+
+```typescript
+const client = new LlamaCppServerClient('http://localhost:8080');
+
+// Generate embeddings for semantic search
+const { embedding } = await client.createEmbedding('Search query text');
+console.log(embedding.length); // e.g., 768 dimensions
+
+// With images (for multimodal models)
+const { embedding: multimodalEmbed } = await client.createEmbedding(
+  'Describe this image',
+  'base64_image_data_here'
+);
+```
+
+#### Code Infilling
+
+Perfect for code completion in IDEs:
+
+```typescript
+const client = new LlamaCppServerClient('http://localhost:8080');
+
+const result = await client.infill(
+  'def calculate_fibonacci(n):\n    ', // Prefix (before cursor)
+  '\n    return result' // Suffix (after cursor)
+);
+
+console.log(result.content);
+// Output: "if n <= 1:\n        return n\n    result = calculate_fibonacci(n-1) + calculate_fibonacci(n-2)"
+```
+
+### Error Handling
+
+```typescript
+const response = await service.sendMessage({
+  providerId: 'llamacpp',
+  modelId: 'my-model',
+  messages: [{ role: 'user', content: 'Hello' }]
+});
+
+if (response.object === 'error') {
+  switch (response.error.code) {
+    case 'NETWORK_ERROR':
+      console.error('Server not running or unreachable');
+      break;
+    case 'PROVIDER_ERROR':
+      console.error('Server error:', response.error.message);
+      break;
+    default:
+      console.error('Unknown error:', response.error);
+  }
+}
+```
+
+### Best Practices
+
+1. **Model Naming**: Use descriptive model IDs (e.g., `llama-3-8b-instruct`) since llama.cpp accepts any name
+2. **Context Size**: Set appropriate context (`-c` flag) when starting the server
+3. **Parallel Requests**: Configure slots (`-np`) based on your hardware
+4. **Health Monitoring**: Enable `checkHealth` for production to detect server issues early
+5. **Resource Management**: Monitor memory usage; large models need significant RAM
+
+### Troubleshooting
+
+**Server not responding:**
+```bash
+# Check if server is running
+curl http://localhost:8080/health
+
+# Should return: {"status":"ok"}
+```
+
+**Model loading errors:**
+```bash
+# Increase memory or reduce context size
+llama-server -m model.gguf --port 8080 -c 2048
+```
+
+**Slow responses:**
+```bash
+# Use quantized models (smaller but faster)
+# e.g., Q4_K_M, Q5_K_M instead of F16
+
+# Increase threads
+llama-server -m model.gguf --threads 16
+```
+
 ## Using with Electron
 
 `genai-lite` is designed to work seamlessly within an Electron application's main process, especially when paired with a secure storage solution like `genai-key-storage-lite`.
````
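As a small extension of the embedding endpoint documented in the section above, a sketch of scoring semantic similarity between two texts (the `cosineSimilarity` helper is illustrative and assumes the server is running an embedding-capable model):

```typescript
import { LlamaCppServerClient } from 'genai-lite';

// Illustrative helper: cosine similarity between two equal-length vectors.
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

const client = new LlamaCppServerClient('http://localhost:8080');

const { embedding: query } = await client.createEmbedding('How do I reset my password?');
const { embedding: doc } = await client.createEmbedding('Password reset instructions for your account');

console.log('Similarity:', cosineSimilarity(query, doc)); // Closer to 1 means more related
```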
````diff
@@ -709,14 +1056,14 @@ const llmService = new LLMService(electronKeyProvider);
 genai-lite is written in TypeScript and provides comprehensive type definitions:
 
 ```typescript
-import type {
+import type {
   LLMChatRequest,
   LLMChatRequestWithPreset,
   LLMResponse,
   LLMFailureResponse,
   LLMSettings,
   LLMReasoningSettings,
-
+  LLMThinkingTagFallbackSettings,
   ApiKeyProvider,
   ModelPreset,
   LLMServiceOptions,
@@ -725,6 +1072,26 @@ import type {
   CreateMessagesResult,
   TemplateMetadata
 } from 'genai-lite';
+
+// llama.cpp integration types and classes
+import {
+  LlamaCppClientAdapter,
+  LlamaCppServerClient,
+  createFallbackModelInfo
+} from 'genai-lite';
+
+import type {
+  LlamaCppClientConfig,
+  LlamaCppHealthResponse,
+  LlamaCppTokenizeResponse,
+  LlamaCppDetokenizeResponse,
+  LlamaCppEmbeddingResponse,
+  LlamaCppInfillResponse,
+  LlamaCppPropsResponse,
+  LlamaCppMetricsResponse,
+  LlamaCppSlot,
+  LlamaCppSlotsResponse
+} from 'genai-lite';
 ```
 
 ## Utilities
@@ -968,24 +1335,23 @@ const { messages } = await llmService.createMessages({
   presetId: 'openai-gpt-4.1-default' // Optional: adds model context
 });
 
-// Advanced:
+// Advanced: Adaptive prompts based on model capabilities
 const { messages, modelContext } = await llmService.createMessages({
   template: `
 <SYSTEM>
-You are a
-{{
+You are a problem-solving assistant.
+{{ requires_tags_for_thinking ? ' For complex problems, write your reasoning in <thinking> tags before answering.' : '' }}
 </SYSTEM>
-<USER>
-{{ thinking_enabled ? 'Please solve this step-by-step:' : 'Please answer:' }}
-{{ question }}
-</USER>
+<USER>{{ question }}</USER>
   `,
+  // Note: Use requires_tags_for_thinking (NOT operator) - only instruct models that don't have active native reasoning
   variables: { question: 'What causes the seasons on Earth?' },
   presetId: 'anthropic-claude-3-7-sonnet-20250219-thinking'
 });
 
 console.log('Model context:', modelContext);
-// Output: {
+// Output: { native_reasoning_active: true, native_reasoning_capable: true, model_id: 'claude-3-7-sonnet-20250219', ... }
+// Note: With a reasoning model, the system prompt won't include thinking tag instructions
 ```
 
 **Low-Level Utilities:**
@@ -1106,6 +1472,10 @@ These utilities enable:
 - **Template Reusability**: Define templates once, use with different variables
 - **Type Safety**: Full TypeScript support with LLMMessage types
 
+## Examples
+
+See the **[chat-demo](examples/chat-demo)** application for a complete working example that demonstrates all library features in a production-ready React + Express application.
+
 ## Contributing
 
 Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
````
package/dist/index.d.ts
CHANGED
````diff
@@ -5,7 +5,12 @@ export type { ModelPreset } from "./types/presets";
 export * from "./llm/types";
 export * from "./llm/clients/types";
 export { fromEnvironment } from "./providers/fromEnvironment";
+export { LlamaCppClientAdapter } from "./llm/clients/LlamaCppClientAdapter";
+export { LlamaCppServerClient } from "./llm/clients/LlamaCppServerClient";
+export type { LlamaCppClientConfig, } from "./llm/clients/LlamaCppClientAdapter";
+export type { LlamaCppHealthResponse, LlamaCppTokenizeResponse, LlamaCppDetokenizeResponse, LlamaCppEmbeddingResponse, LlamaCppInfillResponse, LlamaCppPropsResponse, LlamaCppMetricsResponse, LlamaCppSlot, LlamaCppSlotsResponse, } from "./llm/clients/LlamaCppServerClient";
 export { renderTemplate } from "./prompting/template";
 export { countTokens, getSmartPreview, extractRandomVariables } from "./prompting/content";
 export { parseStructuredContent, parseRoleTags, extractInitialTaggedContent, parseTemplateWithMetadata } from "./prompting/parser";
 export type { TemplateMetadata } from "./prompting/parser";
+export { createFallbackModelInfo } from "./llm/config";
````
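A minimal sketch consuming the new exports from TypeScript; only `baseURL` and `checkHealth` are documented in the README above, so the config is kept to those fields:

```typescript
import { LLMService, LlamaCppClientAdapter, LlamaCppServerClient } from 'genai-lite';
import type { LlamaCppClientConfig, LlamaCppHealthResponse } from 'genai-lite';

const baseURL = 'http://localhost:8080';

// Typed adapter config using the newly exported LlamaCppClientConfig.
const config: LlamaCppClientConfig = { baseURL, checkHealth: true };

const service = new LLMService(async () => 'not-needed');
service.registerAdapter('llamacpp-local', new LlamaCppClientAdapter(config));

// Typed server-client response using the newly exported LlamaCppHealthResponse.
const client = new LlamaCppServerClient(baseURL);
const health: LlamaCppHealthResponse = await client.getHealth();
console.log(health.status); // 'ok', 'loading', or 'error'
```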
package/dist/index.js
CHANGED
````diff
@@ -14,7 +14,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
     for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.parseTemplateWithMetadata = exports.extractInitialTaggedContent = exports.parseRoleTags = exports.parseStructuredContent = exports.extractRandomVariables = exports.getSmartPreview = exports.countTokens = exports.renderTemplate = exports.fromEnvironment = exports.LLMService = void 0;
+exports.createFallbackModelInfo = exports.parseTemplateWithMetadata = exports.extractInitialTaggedContent = exports.parseRoleTags = exports.parseStructuredContent = exports.extractRandomVariables = exports.getSmartPreview = exports.countTokens = exports.renderTemplate = exports.LlamaCppServerClient = exports.LlamaCppClientAdapter = exports.fromEnvironment = exports.LLMService = void 0;
 // --- LLM Service ---
 var LLMService_1 = require("./llm/LLMService");
 Object.defineProperty(exports, "LLMService", { enumerable: true, get: function () { return LLMService_1.LLMService; } });
@@ -25,6 +25,11 @@ __exportStar(require("./llm/clients/types"), exports);
 // --- API Key Providers ---
 var fromEnvironment_1 = require("./providers/fromEnvironment");
 Object.defineProperty(exports, "fromEnvironment", { enumerable: true, get: function () { return fromEnvironment_1.fromEnvironment; } });
+// --- llama.cpp Integration ---
+var LlamaCppClientAdapter_1 = require("./llm/clients/LlamaCppClientAdapter");
+Object.defineProperty(exports, "LlamaCppClientAdapter", { enumerable: true, get: function () { return LlamaCppClientAdapter_1.LlamaCppClientAdapter; } });
+var LlamaCppServerClient_1 = require("./llm/clients/LlamaCppServerClient");
+Object.defineProperty(exports, "LlamaCppServerClient", { enumerable: true, get: function () { return LlamaCppServerClient_1.LlamaCppServerClient; } });
 // --- Utilities ---
 var template_1 = require("./prompting/template");
 Object.defineProperty(exports, "renderTemplate", { enumerable: true, get: function () { return template_1.renderTemplate; } });
@@ -37,3 +42,5 @@ Object.defineProperty(exports, "parseStructuredContent", { enumerable: true, get
 Object.defineProperty(exports, "parseRoleTags", { enumerable: true, get: function () { return parser_1.parseRoleTags; } });
 Object.defineProperty(exports, "extractInitialTaggedContent", { enumerable: true, get: function () { return parser_1.extractInitialTaggedContent; } });
 Object.defineProperty(exports, "parseTemplateWithMetadata", { enumerable: true, get: function () { return parser_1.parseTemplateWithMetadata; } });
+var config_1 = require("./llm/config");
+Object.defineProperty(exports, "createFallbackModelInfo", { enumerable: true, get: function () { return config_1.createFallbackModelInfo; } });
````
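Because the compiled CommonJS entry point wires up the same names (as shown above), the additions are also reachable without going through the TypeScript typings; a quick smoke-test sketch (the `createRequire` wrapper is just to keep the example in TypeScript/ESM):

```typescript
import { createRequire } from 'node:module';

// Quick smoke test against the compiled CommonJS entry point.
const require = createRequire(import.meta.url);
const genai = require('genai-lite');

const client = new genai.LlamaCppServerClient('http://localhost:8080');
console.log(client instanceof genai.LlamaCppServerClient); // true

console.log(typeof genai.LlamaCppClientAdapter);   // 'function'
console.log(typeof genai.createFallbackModelInfo); // 'function'
```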