genai-lite 0.3.3 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. package/README.md +421 -51
  2. package/dist/index.d.ts +5 -0
  3. package/dist/index.js +8 -1
  4. package/dist/llm/LLMService.d.ts +29 -2
  5. package/dist/llm/LLMService.js +67 -36
  6. package/dist/llm/clients/LlamaCppClientAdapter.d.ts +116 -0
  7. package/dist/llm/clients/LlamaCppClientAdapter.js +289 -0
  8. package/dist/llm/clients/LlamaCppServerClient.d.ts +161 -0
  9. package/dist/llm/clients/LlamaCppServerClient.js +192 -0
  10. package/dist/llm/config.d.ts +12 -0
  11. package/dist/llm/config.js +81 -4
  12. package/dist/llm/services/ModelResolver.js +13 -13
  13. package/dist/llm/services/SettingsManager.js +17 -11
  14. package/dist/llm/types.d.ts +87 -22
  15. package/dist/prompting/parser.d.ts +2 -2
  16. package/dist/prompting/parser.js +2 -2
  17. package/dist/providers/fromEnvironment.d.ts +4 -0
  18. package/dist/providers/fromEnvironment.js +8 -0
  19. package/package.json +1 -1
  20. package/dist/llm/LLMService.createMessages.test.d.ts +0 -4
  21. package/dist/llm/LLMService.createMessages.test.js +0 -364
  22. package/dist/llm/LLMService.original.d.ts +0 -147
  23. package/dist/llm/LLMService.original.js +0 -656
  24. package/dist/llm/LLMService.prepareMessage.test.d.ts +0 -1
  25. package/dist/llm/LLMService.prepareMessage.test.js +0 -303
  26. package/dist/llm/LLMService.presets.test.d.ts +0 -1
  27. package/dist/llm/LLMService.presets.test.js +0 -210
  28. package/dist/llm/LLMService.sendMessage.preset.test.d.ts +0 -1
  29. package/dist/llm/LLMService.sendMessage.preset.test.js +0 -153
  30. package/dist/llm/LLMService.test.d.ts +0 -1
  31. package/dist/llm/LLMService.test.js +0 -620
  32. package/dist/llm/clients/AnthropicClientAdapter.test.d.ts +0 -1
  33. package/dist/llm/clients/AnthropicClientAdapter.test.js +0 -273
  34. package/dist/llm/clients/GeminiClientAdapter.test.d.ts +0 -1
  35. package/dist/llm/clients/GeminiClientAdapter.test.js +0 -405
  36. package/dist/llm/clients/MockClientAdapter.test.d.ts +0 -1
  37. package/dist/llm/clients/MockClientAdapter.test.js +0 -250
  38. package/dist/llm/clients/OpenAIClientAdapter.test.d.ts +0 -1
  39. package/dist/llm/clients/OpenAIClientAdapter.test.js +0 -258
  40. package/dist/llm/clients/adapterErrorUtils.test.d.ts +0 -1
  41. package/dist/llm/clients/adapterErrorUtils.test.js +0 -123
  42. package/dist/llm/config.test.d.ts +0 -1
  43. package/dist/llm/config.test.js +0 -176
  44. package/dist/llm/services/AdapterRegistry.test.d.ts +0 -1
  45. package/dist/llm/services/AdapterRegistry.test.js +0 -239
  46. package/dist/llm/services/ModelResolver.test.d.ts +0 -1
  47. package/dist/llm/services/ModelResolver.test.js +0 -158
  48. package/dist/llm/services/PresetManager.test.d.ts +0 -1
  49. package/dist/llm/services/PresetManager.test.js +0 -210
  50. package/dist/llm/services/RequestValidator.test.d.ts +0 -1
  51. package/dist/llm/services/RequestValidator.test.js +0 -159
  52. package/dist/llm/services/SettingsManager.test.d.ts +0 -1
  53. package/dist/llm/services/SettingsManager.test.js +0 -266
  54. package/dist/prompting/builder.d.ts +0 -38
  55. package/dist/prompting/builder.js +0 -63
  56. package/dist/prompting/builder.test.d.ts +0 -4
  57. package/dist/prompting/builder.test.js +0 -109
  58. package/dist/prompting/content.test.d.ts +0 -4
  59. package/dist/prompting/content.test.js +0 -212
  60. package/dist/prompting/parser.test.d.ts +0 -4
  61. package/dist/prompting/parser.test.js +0 -464
  62. package/dist/prompting/template.test.d.ts +0 -1
  63. package/dist/prompting/template.test.js +0 -250
  64. package/dist/providers/fromEnvironment.test.d.ts +0 -1
  65. package/dist/providers/fromEnvironment.test.js +0 -46
package/README.md CHANGED
@@ -1,10 +1,11 @@
  # genai-lite
 
- A lightweight, portable Node.js/TypeScript library providing a unified interface for interacting with multiple Generative AI providers (OpenAI, Anthropic, Google Gemini, Mistral, and more).
+ A lightweight, portable Node.js/TypeScript library providing a unified interface for interacting with multiple Generative AI providers, both cloud-based (OpenAI, Anthropic, Google Gemini, Mistral) and local (llama.cpp).
 
  ## Features
 
  - 🔌 **Unified API** - Single interface for multiple AI providers
+ - 🏠 **Local & Cloud Models** - Run models locally with llama.cpp or use cloud APIs
  - 🔐 **Flexible API Key Management** - Bring your own key storage solution
  - 📦 **Zero Electron Dependencies** - Works in any Node.js environment
  - 🎯 **TypeScript First** - Full type safety and IntelliSense support
@@ -21,13 +22,14 @@ npm install genai-lite
 
  ## Quick Start
 
+ ### Cloud Providers (OpenAI, Anthropic, Gemini, Mistral)
+
  ```typescript
  import { LLMService, fromEnvironment } from 'genai-lite';
 
  // Create service with environment variable API key provider
  const llmService = new LLMService(fromEnvironment);
 
- // Option 1: Direct message sending
  const response = await llmService.sendMessage({
    providerId: 'openai',
    modelId: 'gpt-4.1-mini',
@@ -37,26 +39,47 @@ const response = await llmService.sendMessage({
    ]
  });
 
- // Option 2: Create messages from template (recommended for complex prompts)
- const { messages } = await llmService.createMessages({
-   template: '<SYSTEM>You are a helpful assistant.</SYSTEM><USER>Hello, how are you?</USER>',
-   providerId: 'openai',
-   modelId: 'gpt-4.1-mini'
- });
+ if (response.object === 'chat.completion') {
+   console.log(response.choices[0].message.content);
+ } else {
+   console.error('Error:', response.error.message);
+ }
+ ```
 
- const response2 = await llmService.sendMessage({
-   providerId: 'openai',
-   modelId: 'gpt-4.1-mini',
-   messages
+ ### Local Models (llama.cpp)
+
+ ```typescript
+ import { LLMService } from 'genai-lite';
+
+ // Start llama.cpp server first: llama-server -m /path/to/model.gguf --port 8080
+ const llmService = new LLMService(async () => 'not-needed');
+
+ const response = await llmService.sendMessage({
+   providerId: 'llamacpp',
+   modelId: 'llama-3-8b-instruct', // Must match your loaded model
+   messages: [
+     { role: 'system', content: 'You are a helpful assistant.' },
+     { role: 'user', content: 'Explain quantum computing briefly.' }
+   ]
  });
 
  if (response.object === 'chat.completion') {
    console.log(response.choices[0].message.content);
- } else {
-   console.error('Error:', response.error.message);
  }
  ```
 
+ See the [llama.cpp Integration](#llamacpp-integration) section for setup details.
+
+ ## Example Application
+
+ For a complete, production-ready example showcasing all genai-lite capabilities, see the **[chat-demo](examples/chat-demo)** interactive web application. The demo includes:
+ - Multi-provider chat interface with all supported providers
+ - Template rendering and model presets
+ - llama.cpp utilities (tokenization, embeddings, health checks)
+ - Settings persistence, export/import features
+
+ The chat-demo serves as both a comprehensive showcase and a quick-test environment for library changes.
+
  ## API Key Management
 
  genai-lite uses a flexible API key provider pattern. You can use the built-in environment variable provider or create your own:
@@ -124,6 +147,64 @@ const llmService = new LLMService(myKeyProvider);
  - `codestral-2501` - Specialized for code generation
  - `devstral-small-2505` - Compact development-focused model
 
+ ### llama.cpp (Local Models)
+
+ Run models locally via [llama.cpp](https://github.com/ggml-org/llama.cpp) server. Model IDs can be any name; they're not validated since you load your own GGUF models.
+
+ **Example models:**
+ - `llama-3-8b-instruct` - Llama 3 8B Instruct
+ - `llama-3-70b-instruct` - Llama 3 70B Instruct
+ - `mistral-7b-instruct` - Mistral 7B Instruct
+ - `my-custom-model` - Any custom model you've loaded
+
+ **Setup:**
+
+ 1. Start llama.cpp server with your model:
+ ```bash
+ llama-server -m /path/to/model.gguf --port 8080
+ ```
+
+ 2. Use with genai-lite (no API key needed):
+ ```typescript
+ import { LLMService } from 'genai-lite';
+
+ // API key can be any string for llama.cpp
+ const service = new LLMService(async () => 'not-needed');
+
+ const response = await service.sendMessage({
+   providerId: 'llamacpp',
+   modelId: 'llama-3-8b-instruct', // Must match your loaded model name
+   messages: [{ role: 'user', content: 'Hello!' }]
+ });
+ ```
+
+ 3. Configure server URL via environment variable:
+ ```bash
+ export LLAMACPP_API_BASE_URL=http://localhost:8080
+ ```
+
+ **Advanced features** - Access non-LLM endpoints:
+
+ ```typescript
+ import { LlamaCppServerClient } from 'genai-lite';
+
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ // Check server health
+ const health = await client.getHealth();
+
+ // Tokenize text
+ const { tokens } = await client.tokenize('Hello world');
+
+ // Generate embeddings
+ const { embedding } = await client.createEmbedding('Some text');
+
+ // Code completion
+ const result = await client.infill('def hello():\n', '\nprint("done")');
+ ```
+
+ See the [llama.cpp Integration](#llamacpp-integration) section for details.
+
  ### Models with Reasoning Support
 
  Some models include advanced reasoning/thinking capabilities that enhance their problem-solving abilities:
@@ -214,9 +295,11 @@ if (response.object === 'chat.completion' && response.choices[0].reasoning) {
  - Not all models support reasoning - check the [supported models](#models-with-reasoning-support) list
  - The `reasoning` field in the response contains the model's thought process (when available)
 
- ### Automatic Thinking Extraction
+ ### Thinking Extraction and Enforcement
+
+ For models without native reasoning, you can prompt them to output reasoning in XML tags like `<thinking>`. The library then extracts these tags and moves the content to the standardized `reasoning` field, providing a consistent interface across all models.
 
- genai-lite can capture reasoning from any model by automatically extracting content wrapped in XML tags. When models output their thinking process in tags like `<thinking>`, the library automatically moves this content to the standardized `reasoning` field. This works with all models, providing a consistent interface for accessing model reasoning:
+ **Key point:** The library doesn't make models think automatically; you must explicitly instruct non-reasoning models to use thinking tags in your prompt. The library then enforces that these tags are present (for non-reasoning models) or accepts native reasoning (for reasoning models).
 
  ```typescript
  // Prompt the model to think step-by-step in a <thinking> tag
@@ -231,7 +314,7 @@ const response = await llmService.sendMessage({
    content: 'Please think through this problem step by step before answering: What is 15% of 240?'
  }],
  settings: {
-   thinkingExtraction: { enabled: true } // Must explicitly enable
+   thinkingTagFallback: { enabled: true } // Must explicitly enable
  }
  });
 
@@ -253,25 +336,25 @@ const response = await llmService.sendMessage({
    modelId: 'claude-3-5-haiku-20241022',
    messages: [{ role: 'user', content: 'Solve this step by step...' }],
    settings: {
-     thinkingExtraction: {
+     thinkingTagFallback: {
        enabled: true, // Must explicitly enable (default: false)
-       tag: 'scratchpad', // Custom tag name (default: 'thinking')
-       onMissing: 'auto' // Smart enforcement (see below)
+       tagName: 'scratchpad', // Custom tag name (default: 'thinking')
+       enforce: true // Smart enforcement (see below)
      }
    }
  });
  ```
 
- **The `onMissing` Property:**
+ **The `enforce` Property:**
 
- The `onMissing` property controls what happens when the expected thinking tag is not found:
+ The `enforce` boolean controls whether thinking tags are required when native reasoning is not active:
 
- - `'ignore'`: Silently continue without the tag
- - `'warn'`: Log a warning but continue processing
- - `'error'`: Return an error response with the original response preserved in `partialResponse`
- - `'auto'` (default): Intelligently decide based on the model's native reasoning capabilities
+ - `enforce: true` - Error if tags missing AND native reasoning not active (smart enforcement)
+ - `enforce: false` (default) - Extract tags if present, never error
 
- **How `'auto'` Mode Works:**
+ The enforcement is **always smart** - it automatically checks if native reasoning is active and only enforces when the model needs tags as a fallback.
+
+ **How Smart Enforcement Works:**
 
  ```typescript
  // With non-native reasoning models (e.g., GPT-4)
@@ -286,10 +369,10 @@ const response = await llmService.sendMessage({
    content: 'What is 15% of 240?'
  }],
  settings: {
-   thinkingExtraction: { enabled: true } // onMissing: 'auto' is default
+   thinkingTagFallback: { enabled: true, enforce: true }
  }
  });
- // Result: ERROR if <thinking> tag is missing (strict enforcement)
+ // Result: ERROR if <thinking> tag is missing (native reasoning not active)
  // The response is still accessible via errorResponse.partialResponse
 
  // With native reasoning models (e.g., Claude with reasoning enabled)
@@ -299,10 +382,10 @@ const response = await llmService.sendMessage({
    messages: [/* same prompt */],
    settings: {
      reasoning: { enabled: true },
-     thinkingExtraction: { enabled: true }
+     thinkingTagFallback: { enabled: true, enforce: true }
    }
  });
- // Result: SUCCESS even if <thinking> tag is missing (lenient for native reasoning)
+ // Result: SUCCESS even if <thinking> tag is missing (native reasoning is active)
  ```
 
  This intelligent enforcement ensures that:
@@ -429,13 +512,10 @@ The library provides a powerful `createMessages` method that combines template r
  // Basic example: Create model-aware messages
  const { messages, modelContext } = await llmService.createMessages({
    template: `
-     <SYSTEM>
-     You are a {{ thinking_enabled ? "thoughtful" : "helpful" }} assistant.
-     {{ thinking_available && !thinking_enabled ? "Note: Reasoning mode is available for complex problems." : "" }}
-     </SYSTEM>
+     <SYSTEM>You are a helpful assistant.</SYSTEM>
      <USER>{{ question }}</USER>
    `,
-   variables: {
+   variables: {
      question: 'What is the optimal algorithm for finding the shortest path in a weighted graph?'
    },
    presetId: 'anthropic-claude-3-7-sonnet-20250219-thinking'
@@ -479,14 +559,26 @@ The method provides:
  - **Template Rendering**: Full support for conditionals and variable substitution
  - **Role Tag Parsing**: Converts `<SYSTEM>`, `<USER>`, and `<ASSISTANT>` tags to messages
 
- Available model context variables:
- - `thinking_enabled`: Whether reasoning/thinking is enabled for this request
- - `thinking_available`: Whether the model supports reasoning/thinking
+ **Available model context variables:**
+
+ - `native_reasoning_active`: Whether native reasoning is **currently active** for this request
+   - `true`: The model is using built-in reasoning (e.g., Claude 4, o4-mini, Gemini 2.5 Pro with reasoning enabled)
+   - `false`: No native reasoning is active (either because the model doesn't support it, or it's been disabled)
+ - `native_reasoning_capable`: Whether the model **has the capability** to use native reasoning
+   - `true`: Model supports native reasoning (may or may not be enabled)
+   - `false`: Model does not support native reasoning
  - `model_id`: The resolved model ID
  - `provider_id`: The resolved provider ID
  - `reasoning_effort`: The reasoning effort level if specified
  - `reasoning_max_tokens`: The reasoning token budget if specified
 
+ **Best Practice for Templates:**
+ When adding thinking tag instructions to your templates, **always use `requires_tags_for_thinking`** (the negation of `native_reasoning_active`). This ensures:
+ - Models with active native reasoning get clean, direct prompts
+ - Models without native reasoning get explicit instructions to use `<thinking>` tags
+
+ Example: `{{ requires_tags_for_thinking ? ' Write your reasoning in <thinking> tags first.' : '' }}`
+
  #### Advanced Features
 
  **Dynamic Role Injection:**
@@ -536,7 +628,7 @@ const response = await llmService.sendMessage({
    modelId: 'gpt-4.1',
    messages,
    settings: {
-     thinkingExtraction: { enabled: true } // Default, but shown for clarity
+     thinkingTagFallback: { enabled: true } // Default, but shown for clarity
    }
  });
 
@@ -559,7 +651,7 @@ const creativeWritingTemplate = `
  "settings": {
    "temperature": 0.9,
    "maxTokens": 3000,
-   "thinkingExtraction": { "enabled": true, "tag": "reasoning" }
+   "thinkingTagFallback": { "enabled": true, "tagName": "reasoning" }
  }
  }
  </META>
@@ -666,6 +758,261 @@ if (response.object === 'error') {
  }
  ```
 
+ ## llama.cpp Integration
+
+ `genai-lite` provides comprehensive support for running local LLMs via [llama.cpp](https://github.com/ggml-org/llama.cpp) server, enabling completely offline AI capabilities with the same unified interface.
+
+ ### Why llama.cpp?
+
+ - **Privacy**: All model inference runs locally on your hardware
+ - **Cost**: No API costs after initial model download
+ - **Control**: Use any GGUF model from Hugging Face
+ - **Performance**: Optimized C++ implementation with hardware acceleration
+
+ ### Setup
+
+ #### 1. Install llama.cpp
+
+ ```bash
+ # Clone and build llama.cpp
+ git clone https://github.com/ggml-org/llama.cpp
+ cd llama.cpp
+ make
+
+ # Or download pre-built binaries from releases
+ ```
+
+ #### 2. Download a Model
+
+ Get GGUF models from Hugging Face, for example:
+ - [Meta-Llama-3.1-8B-Instruct-GGUF](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
+ - [Mistral-7B-Instruct-v0.3-GGUF](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
+
+ #### 3. Start the Server
+
+ ```bash
+ # Basic usage
+ llama-server -m /path/to/model.gguf --port 8080
+
+ # With more options: 4096-token context, 4 parallel slots, 8 CPU threads
+ llama-server -m /path/to/model.gguf \
+   --port 8080 \
+   -c 4096 \
+   -np 4 \
+   --threads 8
+ ```
+
+ ### Basic Usage
+
+ ```typescript
+ import { LLMService } from 'genai-lite';
+
+ // llama.cpp doesn't need API keys
+ const service = new LLMService(async () => 'not-needed');
+
+ const response = await service.sendMessage({
+   providerId: 'llamacpp',
+   modelId: 'llama-3-8b-instruct', // Arbitrary name matching your model
+   messages: [
+     { role: 'system', content: 'You are a helpful assistant.' },
+     { role: 'user', content: 'Explain quantum computing in simple terms.' }
+   ],
+   settings: {
+     temperature: 0.7,
+     maxTokens: 500
+   }
+ });
+
+ if (response.object === 'chat.completion') {
+   console.log(response.choices[0].message.content);
+ }
+ ```
+
+ ### Configuration
+
+ #### Environment Variable
+
+ Set the server URL via environment variable (default: `http://localhost:8080`):
+
+ ```bash
+ export LLAMACPP_API_BASE_URL=http://localhost:8080
+ ```
+
+ #### Multiple Servers
+
+ Register multiple llama.cpp instances for different models:
+
+ ```typescript
+ import { LLMService, LlamaCppClientAdapter } from 'genai-lite';
+
+ const service = new LLMService(async () => 'not-needed');
+
+ // Register adapters for different servers/models
+ service.registerAdapter(
+   'llamacpp-small',
+   new LlamaCppClientAdapter({ baseURL: 'http://localhost:8080' })
+ );
+
+ service.registerAdapter(
+   'llamacpp-large',
+   new LlamaCppClientAdapter({ baseURL: 'http://localhost:8081' })
+ );
+
+ // Use them
+ const response = await service.sendMessage({
+   providerId: 'llamacpp-small',
+   modelId: 'llama-3-8b',
+   messages: [{ role: 'user', content: 'Hello!' }]
+ });
+ ```
+
+ #### Health Checking
+
+ Enable automatic health checks before requests:
+
+ ```typescript
+ import { LlamaCppClientAdapter } from 'genai-lite';
+
+ const adapter = new LlamaCppClientAdapter({
+   baseURL: 'http://localhost:8080',
+   checkHealth: true // Check server status before each request
+ });
+
+ service.registerAdapter('llamacpp', adapter);
+ ```
+
+ ### Advanced Features
+
+ #### Server Management
+
+ The `LlamaCppServerClient` class provides access to all llama.cpp server endpoints:
+
+ ```typescript
+ import { LlamaCppServerClient } from 'genai-lite';
+
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ // Health monitoring
+ const health = await client.getHealth();
+ console.log(health.status); // 'ok', 'loading', or 'error'
+
+ // Server properties
+ const props = await client.getProps();
+ console.log(props.total_slots); // Number of available slots
+
+ // Performance metrics (if enabled)
+ const metrics = await client.getMetrics();
+ ```
+
+ #### Tokenization
+
+ ```typescript
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ // Tokenize text
+ const { tokens } = await client.tokenize('Hello, world!');
+ console.log(tokens); // [123, 456, 789]
+
+ // Count tokens before sending to LLM
+ const prompt = 'Long text...';
+ const { tokens: promptTokens } = await client.tokenize(prompt);
+ if (promptTokens.length > 4000) {
+   console.log('Prompt too long, truncating...');
+ }
+
+ // Detokenize back to text
+ const { content } = await client.detokenize([123, 456, 789]);
+ console.log(content); // 'Hello, world!'
+ ```
+
+ #### Text Embeddings
+
+ ```typescript
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ // Generate embeddings for semantic search
+ const { embedding } = await client.createEmbedding('Search query text');
+ console.log(embedding.length); // e.g., 768 dimensions
+
+ // With images (for multimodal models)
+ const { embedding: multimodalEmbed } = await client.createEmbedding(
+   'Describe this image',
+   'base64_image_data_here'
+ );
+ ```
+
+ #### Code Infilling
+
+ Perfect for code completion in IDEs:
+
+ ```typescript
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ const result = await client.infill(
+   'def calculate_fibonacci(n):\n ', // Prefix (before cursor)
+   '\n return result' // Suffix (after cursor)
+ );
+
+ console.log(result.content);
+ // Output: "if n <= 1:\n return n\n result = calculate_fibonacci(n-1) + calculate_fibonacci(n-2)"
+ ```
+
+ ### Error Handling
+
+ ```typescript
+ const response = await service.sendMessage({
+   providerId: 'llamacpp',
+   modelId: 'my-model',
+   messages: [{ role: 'user', content: 'Hello' }]
+ });
+
+ if (response.object === 'error') {
+   switch (response.error.code) {
+     case 'NETWORK_ERROR':
+       console.error('Server not running or unreachable');
+       break;
+     case 'PROVIDER_ERROR':
+       console.error('Server error:', response.error.message);
+       break;
+     default:
+       console.error('Unknown error:', response.error);
+   }
+ }
+ ```
+
+ ### Best Practices
+
+ 1. **Model Naming**: Use descriptive model IDs (e.g., `llama-3-8b-instruct`) since llama.cpp accepts any name
+ 2. **Context Size**: Set appropriate context (`-c` flag) when starting the server
+ 3. **Parallel Requests**: Configure slots (`-np`) based on your hardware
+ 4. **Health Monitoring**: Enable `checkHealth` for production to detect server issues early
+ 5. **Resource Management**: Monitor memory usage; large models need significant RAM
+
+ ### Troubleshooting
+
+ **Server not responding:**
+ ```bash
+ # Check if server is running
+ curl http://localhost:8080/health
+
+ # Should return: {"status":"ok"}
+ ```
+
+ **Model loading errors:**
+ ```bash
+ # Increase memory or reduce context size
+ llama-server -m model.gguf --port 8080 -c 2048
+ ```
+
+ **Slow responses:**
+ ```bash
+ # Use quantized models (smaller but faster)
+ # e.g., Q4_K_M, Q5_K_M instead of F16
+
+ # Increase threads
+ llama-server -m model.gguf --threads 16
+ ```
+
  ## Using with Electron
 
  `genai-lite` is designed to work seamlessly within an Electron application's main process, especially when paired with a secure storage solution like `genai-key-storage-lite`.
@@ -709,14 +1056,14 @@
  genai-lite is written in TypeScript and provides comprehensive type definitions:
 
  ```typescript
- import type {
+ import type {
    LLMChatRequest,
    LLMChatRequestWithPreset,
    LLMResponse,
    LLMFailureResponse,
    LLMSettings,
    LLMReasoningSettings,
-   LLMThinkingExtractionSettings,
+   LLMThinkingTagFallbackSettings,
    ApiKeyProvider,
    ModelPreset,
    LLMServiceOptions,
@@ -725,6 +1072,26 @@ import type {
    CreateMessagesResult,
    TemplateMetadata
  } from 'genai-lite';
+
+ // llama.cpp integration types and classes
+ import {
+   LlamaCppClientAdapter,
+   LlamaCppServerClient,
+   createFallbackModelInfo
+ } from 'genai-lite';
+
+ import type {
+   LlamaCppClientConfig,
+   LlamaCppHealthResponse,
+   LlamaCppTokenizeResponse,
+   LlamaCppDetokenizeResponse,
+   LlamaCppEmbeddingResponse,
+   LlamaCppInfillResponse,
+   LlamaCppPropsResponse,
+   LlamaCppMetricsResponse,
+   LlamaCppSlot,
+   LlamaCppSlotsResponse
+ } from 'genai-lite';
  ```
 
  ## Utilities
@@ -968,24 +1335,23 @@ const { messages } = await llmService.createMessages({
    presetId: 'openai-gpt-4.1-default' // Optional: adds model context
  });
 
- // Advanced: Leverage model context for adaptive prompts
+ // Advanced: Adaptive prompts based on model capabilities
  const { messages, modelContext } = await llmService.createMessages({
    template: `
      <SYSTEM>
-     You are a {{ thinking_enabled ? 'analytical problem solver' : 'quick helper' }}.
-     {{ model_id.includes('claude') ? 'Use your advanced reasoning capabilities.' : '' }}
+     You are a problem-solving assistant.
+     {{ requires_tags_for_thinking ? ' For complex problems, write your reasoning in <thinking> tags before answering.' : '' }}
      </SYSTEM>
-     <USER>
-     {{ thinking_enabled ? 'Please solve this step-by-step:' : 'Please answer:' }}
-     {{ question }}
-     </USER>
+     <USER>{{ question }}</USER>
    `,
+   // Note: requires_tags_for_thinking is the negation of native_reasoning_active - only models without active native reasoning get the tag instruction
    variables: { question: 'What causes the seasons on Earth?' },
    presetId: 'anthropic-claude-3-7-sonnet-20250219-thinking'
  });
 
  console.log('Model context:', modelContext);
- // Output: { thinking_enabled: true, thinking_available: true, model_id: 'claude-3-7-sonnet-20250219', ... }
+ // Output: { native_reasoning_active: true, native_reasoning_capable: true, model_id: 'claude-3-7-sonnet-20250219', ... }
+ // Note: With a reasoning model, the system prompt won't include thinking tag instructions
  ```
 
  **Low-Level Utilities:**
@@ -1106,6 +1472,10 @@ These utilities enable:
  - **Template Reusability**: Define templates once, use with different variables
  - **Type Safety**: Full TypeScript support with LLMMessage types
 
+ ## Examples
+
+ See the **[chat-demo](examples/chat-demo)** application for a complete working example that demonstrates all library features in a production-ready React + Express application.
+
  ## Contributing
 
  Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
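The README hunks above document the headline breaking change of 0.4.x: `thinkingExtraction` becomes `thinkingTagFallback`, its `tag` option becomes `tagName`, and the `onMissing` modes collapse into a single smart `enforce` boolean. A minimal before/after migration sketch, using the provider and model IDs from the Quick Start section shown above (the prompt text is only a placeholder):

```typescript
import { LLMService, fromEnvironment } from 'genai-lite';

const llmService = new LLMService(fromEnvironment);

// 0.3.x settings shape (removed in this release):
//   settings: { thinkingExtraction: { enabled: true, tag: 'scratchpad', onMissing: 'auto' } }

// 0.4.x settings shape, as documented in the README diff above:
const response = await llmService.sendMessage({
  providerId: 'openai',
  modelId: 'gpt-4.1-mini',
  messages: [
    { role: 'user', content: 'Think in <scratchpad> tags, then answer: what is 15% of 240?' }
  ],
  settings: {
    thinkingTagFallback: {
      enabled: true,         // extraction is still opt-in
      tagName: 'scratchpad', // was `tag`
      enforce: true          // replaces the 'ignore' | 'warn' | 'error' | 'auto' modes
    }
  }
});

if (response.object === 'chat.completion') {
  console.log(response.choices[0].reasoning);       // extracted <scratchpad> content, when present
  console.log(response.choices[0].message.content); // the answer itself
} else {
  console.error(response.error.message);            // enforcement errors keep the raw reply in partialResponse
}
```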
package/dist/index.d.ts CHANGED
@@ -5,7 +5,12 @@ export type { ModelPreset } from "./types/presets";
  export * from "./llm/types";
  export * from "./llm/clients/types";
  export { fromEnvironment } from "./providers/fromEnvironment";
+ export { LlamaCppClientAdapter } from "./llm/clients/LlamaCppClientAdapter";
+ export { LlamaCppServerClient } from "./llm/clients/LlamaCppServerClient";
+ export type { LlamaCppClientConfig, } from "./llm/clients/LlamaCppClientAdapter";
+ export type { LlamaCppHealthResponse, LlamaCppTokenizeResponse, LlamaCppDetokenizeResponse, LlamaCppEmbeddingResponse, LlamaCppInfillResponse, LlamaCppPropsResponse, LlamaCppMetricsResponse, LlamaCppSlot, LlamaCppSlotsResponse, } from "./llm/clients/LlamaCppServerClient";
  export { renderTemplate } from "./prompting/template";
  export { countTokens, getSmartPreview, extractRandomVariables } from "./prompting/content";
  export { parseStructuredContent, parseRoleTags, extractInitialTaggedContent, parseTemplateWithMetadata } from "./prompting/parser";
  export type { TemplateMetadata } from "./prompting/parser";
+ export { createFallbackModelInfo } from "./llm/config";
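The declaration changes above define the new public surface for the llama.cpp integration. A small sketch of importing and wiring these exports together, assuming a llama.cpp server at `http://localhost:8080` as in the README and assuming each `LlamaCpp*Response` type pairs with the like-named `LlamaCppServerClient` method (that pairing is not spelled out in this diff); `createFallbackModelInfo` is only imported here because its signature is not shown in this release:

```typescript
import {
  LLMService,
  LlamaCppClientAdapter,
  LlamaCppServerClient,
  createFallbackModelInfo, // newly re-exported from ./llm/config; signature not shown in this diff
} from 'genai-lite';
import type {
  LlamaCppClientConfig,
  LlamaCppHealthResponse,
  LlamaCppTokenizeResponse,
} from 'genai-lite';

// Typed adapter config, mirroring the README's "Multiple Servers" example
const config: LlamaCppClientConfig = { baseURL: 'http://localhost:8080' };

const service = new LLMService(async () => 'not-needed'); // llama.cpp ignores the key
service.registerAdapter('llamacpp', new LlamaCppClientAdapter(config));

// Direct access to the non-LLM endpoints documented in the README above
const server = new LlamaCppServerClient('http://localhost:8080');
const health: LlamaCppHealthResponse = await server.getHealth();
const tokenized: LlamaCppTokenizeResponse = await server.tokenize('Hello, world!');
console.log(health.status, tokenized.tokens.length);
```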
package/dist/index.js CHANGED
@@ -14,7 +14,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
  for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.parseTemplateWithMetadata = exports.extractInitialTaggedContent = exports.parseRoleTags = exports.parseStructuredContent = exports.extractRandomVariables = exports.getSmartPreview = exports.countTokens = exports.renderTemplate = exports.fromEnvironment = exports.LLMService = void 0;
+ exports.createFallbackModelInfo = exports.parseTemplateWithMetadata = exports.extractInitialTaggedContent = exports.parseRoleTags = exports.parseStructuredContent = exports.extractRandomVariables = exports.getSmartPreview = exports.countTokens = exports.renderTemplate = exports.LlamaCppServerClient = exports.LlamaCppClientAdapter = exports.fromEnvironment = exports.LLMService = void 0;
  // --- LLM Service ---
  var LLMService_1 = require("./llm/LLMService");
  Object.defineProperty(exports, "LLMService", { enumerable: true, get: function () { return LLMService_1.LLMService; } });
@@ -25,6 +25,11 @@ __exportStar(require("./llm/clients/types"), exports);
  // --- API Key Providers ---
  var fromEnvironment_1 = require("./providers/fromEnvironment");
  Object.defineProperty(exports, "fromEnvironment", { enumerable: true, get: function () { return fromEnvironment_1.fromEnvironment; } });
+ // --- llama.cpp Integration ---
+ var LlamaCppClientAdapter_1 = require("./llm/clients/LlamaCppClientAdapter");
+ Object.defineProperty(exports, "LlamaCppClientAdapter", { enumerable: true, get: function () { return LlamaCppClientAdapter_1.LlamaCppClientAdapter; } });
+ var LlamaCppServerClient_1 = require("./llm/clients/LlamaCppServerClient");
+ Object.defineProperty(exports, "LlamaCppServerClient", { enumerable: true, get: function () { return LlamaCppServerClient_1.LlamaCppServerClient; } });
  // --- Utilities ---
  var template_1 = require("./prompting/template");
  Object.defineProperty(exports, "renderTemplate", { enumerable: true, get: function () { return template_1.renderTemplate; } });
@@ -37,3 +42,5 @@ Object.defineProperty(exports, "parseStructuredContent", { enumerable: true, get
  Object.defineProperty(exports, "parseRoleTags", { enumerable: true, get: function () { return parser_1.parseRoleTags; } });
  Object.defineProperty(exports, "extractInitialTaggedContent", { enumerable: true, get: function () { return parser_1.extractInitialTaggedContent; } });
  Object.defineProperty(exports, "parseTemplateWithMetadata", { enumerable: true, get: function () { return parser_1.parseTemplateWithMetadata; } });
+ var config_1 = require("./llm/config");
+ Object.defineProperty(exports, "createFallbackModelInfo", { enumerable: true, get: function () { return config_1.createFallbackModelInfo; } });
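The compiled entry point above registers the same llama.cpp exports for runtime use. As a closing sketch of how the pieces documented in this release compose, here is a combination of the README's health-checking and error-handling guidance; the model ID is an arbitrary placeholder, since the README notes llama.cpp does not validate it:

```typescript
import { LLMService, LlamaCppClientAdapter } from 'genai-lite';

const service = new LLMService(async () => 'not-needed');

// Health-checked adapter, per the "Health Checking" section of the README above
service.registerAdapter(
  'llamacpp',
  new LlamaCppClientAdapter({ baseURL: 'http://localhost:8080', checkHealth: true })
);

const response = await service.sendMessage({
  providerId: 'llamacpp',
  modelId: 'my-local-model', // placeholder; llama.cpp accepts any name
  messages: [{ role: 'user', content: 'Hello!' }]
});

// Error codes follow the README's "Error Handling" section
if (response.object === 'error') {
  if (response.error.code === 'NETWORK_ERROR') {
    console.error('llama.cpp server not running or unreachable');
  } else {
    console.error(`${response.error.code}: ${response.error.message}`);
  }
} else {
  console.log(response.choices[0].message.content);
}
```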