genai-lite 0.3.3 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. package/README.md +421 -51
  2. package/dist/index.d.ts +5 -0
  3. package/dist/index.js +8 -1
  4. package/dist/llm/LLMService.d.ts +29 -2
  5. package/dist/llm/LLMService.js +67 -36
  6. package/dist/llm/clients/LlamaCppClientAdapter.d.ts +116 -0
  7. package/dist/llm/clients/LlamaCppClientAdapter.js +289 -0
  8. package/dist/llm/clients/LlamaCppServerClient.d.ts +161 -0
  9. package/dist/llm/clients/LlamaCppServerClient.js +192 -0
  10. package/dist/llm/config.d.ts +12 -0
  11. package/dist/llm/config.js +81 -4
  12. package/dist/llm/services/ModelResolver.js +13 -13
  13. package/dist/llm/services/SettingsManager.js +17 -11
  14. package/dist/llm/types.d.ts +87 -22
  15. package/dist/prompting/parser.d.ts +2 -2
  16. package/dist/prompting/parser.js +2 -2
  17. package/dist/providers/fromEnvironment.d.ts +4 -0
  18. package/dist/providers/fromEnvironment.js +8 -0
  19. package/package.json +1 -1
  20. package/dist/llm/LLMService.createMessages.test.d.ts +0 -4
  21. package/dist/llm/LLMService.createMessages.test.js +0 -364
  22. package/dist/llm/LLMService.original.d.ts +0 -147
  23. package/dist/llm/LLMService.original.js +0 -656
  24. package/dist/llm/LLMService.prepareMessage.test.d.ts +0 -1
  25. package/dist/llm/LLMService.prepareMessage.test.js +0 -303
  26. package/dist/llm/LLMService.presets.test.d.ts +0 -1
  27. package/dist/llm/LLMService.presets.test.js +0 -210
  28. package/dist/llm/LLMService.sendMessage.preset.test.d.ts +0 -1
  29. package/dist/llm/LLMService.sendMessage.preset.test.js +0 -153
  30. package/dist/llm/LLMService.test.d.ts +0 -1
  31. package/dist/llm/LLMService.test.js +0 -620
  32. package/dist/llm/clients/AnthropicClientAdapter.test.d.ts +0 -1
  33. package/dist/llm/clients/AnthropicClientAdapter.test.js +0 -273
  34. package/dist/llm/clients/GeminiClientAdapter.test.d.ts +0 -1
  35. package/dist/llm/clients/GeminiClientAdapter.test.js +0 -405
  36. package/dist/llm/clients/MockClientAdapter.test.d.ts +0 -1
  37. package/dist/llm/clients/MockClientAdapter.test.js +0 -250
  38. package/dist/llm/clients/OpenAIClientAdapter.test.d.ts +0 -1
  39. package/dist/llm/clients/OpenAIClientAdapter.test.js +0 -258
  40. package/dist/llm/clients/adapterErrorUtils.test.d.ts +0 -1
  41. package/dist/llm/clients/adapterErrorUtils.test.js +0 -123
  42. package/dist/llm/config.test.d.ts +0 -1
  43. package/dist/llm/config.test.js +0 -176
  44. package/dist/llm/services/AdapterRegistry.test.d.ts +0 -1
  45. package/dist/llm/services/AdapterRegistry.test.js +0 -239
  46. package/dist/llm/services/ModelResolver.test.d.ts +0 -1
  47. package/dist/llm/services/ModelResolver.test.js +0 -158
  48. package/dist/llm/services/PresetManager.test.d.ts +0 -1
  49. package/dist/llm/services/PresetManager.test.js +0 -210
  50. package/dist/llm/services/RequestValidator.test.d.ts +0 -1
  51. package/dist/llm/services/RequestValidator.test.js +0 -159
  52. package/dist/llm/services/SettingsManager.test.d.ts +0 -1
  53. package/dist/llm/services/SettingsManager.test.js +0 -266
  54. package/dist/prompting/builder.d.ts +0 -38
  55. package/dist/prompting/builder.js +0 -63
  56. package/dist/prompting/builder.test.d.ts +0 -4
  57. package/dist/prompting/builder.test.js +0 -109
  58. package/dist/prompting/content.test.d.ts +0 -4
  59. package/dist/prompting/content.test.js +0 -212
  60. package/dist/prompting/parser.test.d.ts +0 -4
  61. package/dist/prompting/parser.test.js +0 -464
  62. package/dist/prompting/template.test.d.ts +0 -1
  63. package/dist/prompting/template.test.js +0 -250
  64. package/dist/providers/fromEnvironment.test.d.ts +0 -1
  65. package/dist/providers/fromEnvironment.test.js +0 -46
package/README.md CHANGED
@@ -1,10 +1,11 @@
  # genai-lite
 
- A lightweight, portable Node.js/TypeScript library providing a unified interface for interacting with multiple Generative AI providers (OpenAI, Anthropic, Google Gemini, Mistral, and more).
+ A lightweight, portable Node.js/TypeScript library providing a unified interface for interacting with multiple Generative AI providers, both cloud-based (OpenAI, Anthropic, Google Gemini, Mistral) and local (llama.cpp).
 
  ## Features
 
  - 🔌 **Unified API** - Single interface for multiple AI providers
+ - 🏠 **Local & Cloud Models** - Run models locally with llama.cpp or use cloud APIs
  - 🔐 **Flexible API Key Management** - Bring your own key storage solution
  - 📦 **Zero Electron Dependencies** - Works in any Node.js environment
  - 🎯 **TypeScript First** - Full type safety and IntelliSense support
@@ -21,13 +22,14 @@ npm install genai-lite
 
  ## Quick Start
 
+ ### Cloud Providers (OpenAI, Anthropic, Gemini, Mistral)
+
  ```typescript
  import { LLMService, fromEnvironment } from 'genai-lite';
 
  // Create service with environment variable API key provider
  const llmService = new LLMService(fromEnvironment);
 
- // Option 1: Direct message sending
  const response = await llmService.sendMessage({
    providerId: 'openai',
    modelId: 'gpt-4.1-mini',
@@ -37,26 +39,47 @@ const response = await llmService.sendMessage({
    ]
  });
 
- // Option 2: Create messages from template (recommended for complex prompts)
- const { messages } = await llmService.createMessages({
-   template: '<SYSTEM>You are a helpful assistant.</SYSTEM><USER>Hello, how are you?</USER>',
-   providerId: 'openai',
-   modelId: 'gpt-4.1-mini'
- });
+ if (response.object === 'chat.completion') {
+   console.log(response.choices[0].message.content);
+ } else {
+   console.error('Error:', response.error.message);
+ }
+ ```
 
- const response2 = await llmService.sendMessage({
-   providerId: 'openai',
-   modelId: 'gpt-4.1-mini',
-   messages
+ ### Local Models (llama.cpp)
+
+ ```typescript
+ import { LLMService } from 'genai-lite';
+
+ // Start llama.cpp server first: llama-server -m /path/to/model.gguf --port 8080
+ const llmService = new LLMService(async () => 'not-needed');
+
+ const response = await llmService.sendMessage({
+   providerId: 'llamacpp',
+   modelId: 'llama-3-8b-instruct', // Must match your loaded model
+   messages: [
+     { role: 'system', content: 'You are a helpful assistant.' },
+     { role: 'user', content: 'Explain quantum computing briefly.' }
+   ]
  });
 
  if (response.object === 'chat.completion') {
    console.log(response.choices[0].message.content);
- } else {
-   console.error('Error:', response.error.message);
  }
  ```
 
+ See the [llama.cpp Integration](#llamacpp-integration) section for setup details.
+
+ ## Example Application
+
+ For a complete, production-ready example showcasing all genai-lite capabilities, see the **[chat-demo](examples/chat-demo)** interactive web application. The demo includes:
+ - Multi-provider chat interface with all supported providers
+ - Template rendering and model presets
+ - llama.cpp utilities (tokenization, embeddings, health checks)
+ - Settings persistence, export/import features
+
+ The chat-demo serves as both a comprehensive showcase and a quick-test environment for library changes.
+
  ## API Key Management
 
  genai-lite uses a flexible API key provider pattern. You can use the built-in environment variable provider or create your own:
@@ -124,6 +147,64 @@ const llmService = new LLMService(myKeyProvider);
  - `codestral-2501` - Specialized for code generation
  - `devstral-small-2505` - Compact development-focused model
 
+ ### llama.cpp (Local Models)
+
+ Run models locally via [llama.cpp](https://github.com/ggml-org/llama.cpp) server. Model IDs can be any name; they're not validated since you load your own GGUF models.
+
+ **Example models:**
+ - `llama-3-8b-instruct` - Llama 3 8B Instruct
+ - `llama-3-70b-instruct` - Llama 3 70B Instruct
+ - `mistral-7b-instruct` - Mistral 7B Instruct
+ - `my-custom-model` - Any custom model you've loaded
+
+ **Setup:**
+
+ 1. Start llama.cpp server with your model:
+ ```bash
+ llama-server -m /path/to/model.gguf --port 8080
+ ```
+
+ 2. Use with genai-lite (no API key needed):
+ ```typescript
+ import { LLMService } from 'genai-lite';
+
+ // API key can be any string for llama.cpp
+ const service = new LLMService(async () => 'not-needed');
+
+ const response = await service.sendMessage({
+   providerId: 'llamacpp',
+   modelId: 'llama-3-8b-instruct', // Must match your loaded model name
+   messages: [{ role: 'user', content: 'Hello!' }]
+ });
+ ```
+
+ 3. Configure server URL via environment variable:
+ ```bash
+ export LLAMACPP_API_BASE_URL=http://localhost:8080
+ ```
+
+ **Advanced features** - Access non-LLM endpoints:
+
+ ```typescript
+ import { LlamaCppServerClient } from 'genai-lite';
+
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ // Check server health
+ const health = await client.getHealth();
+
+ // Tokenize text
+ const { tokens } = await client.tokenize('Hello world');
+
+ // Generate embeddings
+ const { embedding } = await client.createEmbedding('Some text');
+
+ // Code completion
+ const result = await client.infill('def hello():\n', '\nprint("done")');
+ ```
+
+ See the [llama.cpp Integration](#llamacpp-integration) section for details.
+
  ### Models with Reasoning Support
 
  Some models include advanced reasoning/thinking capabilities that enhance their problem-solving abilities:
@@ -214,9 +295,11 @@ if (response.object === 'chat.completion' && response.choices[0].reasoning) {
  - Not all models support reasoning - check the [supported models](#models-with-reasoning-support) list
  - The `reasoning` field in the response contains the model's thought process (when available)
 
- ### Automatic Thinking Extraction
+ ### Thinking Extraction and Enforcement
+
+ For models without native reasoning, you can prompt them to output reasoning in XML tags like `<thinking>`. The library then extracts these tags and moves the content to the standardized `reasoning` field, providing a consistent interface across all models.
 
- genai-lite can capture reasoning from any model by automatically extracting content wrapped in XML tags. When models output their thinking process in tags like `<thinking>`, the library automatically moves this content to the standardized `reasoning` field. This works with all models, providing a consistent interface for accessing model reasoning:
+ **Key point:** The library doesn't make models think automatically; you must explicitly instruct non-reasoning models to use thinking tags in your prompt. The library then enforces that these tags are present (for non-reasoning models) or accepts native reasoning (for reasoning models).
 
  ```typescript
  // Prompt the model to think step-by-step in a <thinking> tag
@@ -231,7 +314,7 @@ const response = await llmService.sendMessage({
    content: 'Please think through this problem step by step before answering: What is 15% of 240?'
  }],
  settings: {
-   thinkingExtraction: { enabled: true } // Must explicitly enable
+   thinkingTagFallback: { enabled: true } // Must explicitly enable
  }
  });
 
@@ -253,25 +336,25 @@ const response = await llmService.sendMessage({
    modelId: 'claude-3-5-haiku-20241022',
    messages: [{ role: 'user', content: 'Solve this step by step...' }],
    settings: {
-     thinkingExtraction: {
+     thinkingTagFallback: {
        enabled: true, // Must explicitly enable (default: false)
-       tag: 'scratchpad', // Custom tag name (default: 'thinking')
-       onMissing: 'auto' // Smart enforcement (see below)
+       tagName: 'scratchpad', // Custom tag name (default: 'thinking')
+       enforce: true // Smart enforcement (see below)
      }
    }
  });
  ```
 
- **The `onMissing` Property:**
+ **The `enforce` Property:**
 
- The `onMissing` property controls what happens when the expected thinking tag is not found:
+ The `enforce` boolean controls whether thinking tags are required when native reasoning is not active:
 
- - `'ignore'`: Silently continue without the tag
- - `'warn'`: Log a warning but continue processing
- - `'error'`: Return an error response with the original response preserved in `partialResponse`
- - `'auto'` (default): Intelligently decide based on the model's native reasoning capabilities
+ - `enforce: true` - Error if tags missing AND native reasoning not active (smart enforcement)
+ - `enforce: false` (default) - Extract tags if present, never error
 
- **How `'auto'` Mode Works:**
+ The enforcement is **always smart** - it automatically checks if native reasoning is active and only enforces when the model needs tags as a fallback.
+
+ **How Smart Enforcement Works:**
 
  ```typescript
  // With non-native reasoning models (e.g., GPT-4)
@@ -286,10 +369,10 @@ const response = await llmService.sendMessage({
    content: 'What is 15% of 240?'
  }],
  settings: {
-   thinkingExtraction: { enabled: true } // onMissing: 'auto' is default
+   thinkingTagFallback: { enabled: true, enforce: true }
  }
  });
- // Result: ERROR if <thinking> tag is missing (strict enforcement)
+ // Result: ERROR if <thinking> tag is missing (native reasoning not active)
  // The response is still accessible via errorResponse.partialResponse
 
  // With native reasoning models (e.g., Claude with reasoning enabled)
@@ -299,10 +382,10 @@ const response = await llmService.sendMessage({
    messages: [/* same prompt */],
    settings: {
      reasoning: { enabled: true },
-     thinkingExtraction: { enabled: true }
+     thinkingTagFallback: { enabled: true, enforce: true }
    }
  });
- // Result: SUCCESS even if <thinking> tag is missing (lenient for native reasoning)
+ // Result: SUCCESS even if <thinking> tag is missing (native reasoning is active)
  ```
 
  This intelligent enforcement ensures that:
@@ -429,13 +512,10 @@ The library provides a powerful `createMessages` method that combines template r
  // Basic example: Create model-aware messages
  const { messages, modelContext } = await llmService.createMessages({
    template: `
-     <SYSTEM>
-     You are a {{ thinking_enabled ? "thoughtful" : "helpful" }} assistant.
-     {{ thinking_available && !thinking_enabled ? "Note: Reasoning mode is available for complex problems." : "" }}
-     </SYSTEM>
+     <SYSTEM>You are a helpful assistant.</SYSTEM>
      <USER>{{ question }}</USER>
    `,
-   variables: {
+   variables: {
      question: 'What is the optimal algorithm for finding the shortest path in a weighted graph?'
    },
    presetId: 'anthropic-claude-3-7-sonnet-20250219-thinking'
@@ -479,14 +559,26 @@ The method provides:
  - **Template Rendering**: Full support for conditionals and variable substitution
  - **Role Tag Parsing**: Converts `<SYSTEM>`, `<USER>`, and `<ASSISTANT>` tags to messages
 
- Available model context variables:
- - `thinking_enabled`: Whether reasoning/thinking is enabled for this request
- - `thinking_available`: Whether the model supports reasoning/thinking
+ **Available model context variables:**
+
+ - `native_reasoning_active`: Whether native reasoning is **currently active** for this request
+   - `true`: The model is using built-in reasoning (e.g., Claude 4, o4-mini, Gemini 2.5 Pro with reasoning enabled)
+   - `false`: No native reasoning is active (either because the model doesn't support it, or it's been disabled)
+ - `native_reasoning_capable`: Whether the model **has the capability** to use native reasoning
+   - `true`: Model supports native reasoning (may or may not be enabled)
+   - `false`: Model does not support native reasoning
  - `model_id`: The resolved model ID
  - `provider_id`: The resolved provider ID
  - `reasoning_effort`: The reasoning effort level if specified
  - `reasoning_max_tokens`: The reasoning token budget if specified
 
+ **Best Practice for Templates:**
+ When adding thinking tag instructions to your templates, **always use `requires_tags_for_thinking`** (the negation of `native_reasoning_active`). This ensures:
+ - Models with active native reasoning get clean, direct prompts
+ - Models without native reasoning get explicit instructions to use `<thinking>` tags
+
+ Example: `{{ requires_tags_for_thinking ? ' Write your reasoning in <thinking> tags first.' : '' }}`
+
  #### Advanced Features
 
  **Dynamic Role Injection:**
@@ -536,7 +628,7 @@ const response = await llmService.sendMessage({
    modelId: 'gpt-4.1',
    messages,
    settings: {
-     thinkingExtraction: { enabled: true } // Default, but shown for clarity
+     thinkingTagFallback: { enabled: true } // Default, but shown for clarity
    }
  });
 
@@ -559,7 +651,7 @@ const creativeWritingTemplate = `
  "settings": {
    "temperature": 0.9,
    "maxTokens": 3000,
-   "thinkingExtraction": { "enabled": true, "tag": "reasoning" }
+   "thinkingTagFallback": { "enabled": true, "tagName": "reasoning" }
  }
  }
  </META>
@@ -666,6 +758,261 @@ if (response.object === 'error') {
  }
  ```
 
+ ## llama.cpp Integration
+
+ `genai-lite` provides comprehensive support for running local LLMs via [llama.cpp](https://github.com/ggml-org/llama.cpp) server, enabling completely offline AI capabilities with the same unified interface.
+
+ ### Why llama.cpp?
+
+ - **Privacy**: All model inference runs locally on your hardware
+ - **Cost**: No API costs after initial model download
+ - **Control**: Use any GGUF model from Hugging Face
+ - **Performance**: Optimized C++ implementation with hardware acceleration
+
+ ### Setup
+
+ #### 1. Install llama.cpp
+
+ ```bash
+ # Clone and build llama.cpp
+ git clone https://github.com/ggml-org/llama.cpp
+ cd llama.cpp
+ make
+
+ # Or download pre-built binaries from releases
+ ```
+
+ #### 2. Download a Model
+
+ Get GGUF models from Hugging Face, for example:
+ - [Meta-Llama-3.1-8B-Instruct-GGUF](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
+ - [Mistral-7B-Instruct-v0.3-GGUF](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
+
+ #### 3. Start the Server
+
+ ```bash
+ # Basic usage
+ llama-server -m /path/to/model.gguf --port 8080
+
+ # With more options: 4096-token context, 4 parallel slots, 8 CPU threads
+ llama-server -m /path/to/model.gguf \
+   --port 8080 \
+   -c 4096 \
+   -np 4 \
+   --threads 8
+ ```
+
+ ### Basic Usage
+
+ ```typescript
+ import { LLMService } from 'genai-lite';
+
+ // llama.cpp doesn't need API keys
+ const service = new LLMService(async () => 'not-needed');
+
+ const response = await service.sendMessage({
+   providerId: 'llamacpp',
+   modelId: 'llama-3-8b-instruct', // Arbitrary name matching your model
+   messages: [
+     { role: 'system', content: 'You are a helpful assistant.' },
+     { role: 'user', content: 'Explain quantum computing in simple terms.' }
+   ],
+   settings: {
+     temperature: 0.7,
+     maxTokens: 500
+   }
+ });
+
+ if (response.object === 'chat.completion') {
+   console.log(response.choices[0].message.content);
+ }
+ ```
+
+ ### Configuration
+
+ #### Environment Variable
+
+ Set the server URL via environment variable (default: `http://localhost:8080`):
+
+ ```bash
+ export LLAMACPP_API_BASE_URL=http://localhost:8080
+ ```
+
+ #### Multiple Servers
+
+ Register multiple llama.cpp instances for different models:
+
+ ```typescript
+ import { LLMService, LlamaCppClientAdapter } from 'genai-lite';
+
+ const service = new LLMService(async () => 'not-needed');
+
+ // Register adapters for different servers/models
+ service.registerAdapter(
+   'llamacpp-small',
+   new LlamaCppClientAdapter({ baseURL: 'http://localhost:8080' })
+ );
+
+ service.registerAdapter(
+   'llamacpp-large',
+   new LlamaCppClientAdapter({ baseURL: 'http://localhost:8081' })
+ );
+
+ // Use them
+ const response = await service.sendMessage({
+   providerId: 'llamacpp-small',
+   modelId: 'llama-3-8b',
+   messages: [{ role: 'user', content: 'Hello!' }]
+ });
+ ```
+
+ #### Health Checking
+
+ Enable automatic health checks before requests:
+
+ ```typescript
+ import { LlamaCppClientAdapter } from 'genai-lite';
+
+ const adapter = new LlamaCppClientAdapter({
+   baseURL: 'http://localhost:8080',
+   checkHealth: true // Check server status before each request
+ });
+
+ service.registerAdapter('llamacpp', adapter);
+ ```
+
+ ### Advanced Features
+
+ #### Server Management
+
+ The `LlamaCppServerClient` class provides access to all llama.cpp server endpoints:
+
+ ```typescript
+ import { LlamaCppServerClient } from 'genai-lite';
+
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ // Health monitoring
+ const health = await client.getHealth();
+ console.log(health.status); // 'ok', 'loading', or 'error'
+
+ // Server properties
+ const props = await client.getProps();
+ console.log(props.total_slots); // Number of available slots
+
+ // Performance metrics (if enabled)
+ const metrics = await client.getMetrics();
+ ```
+
+ #### Tokenization
+
+ ```typescript
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ // Tokenize text
+ const { tokens } = await client.tokenize('Hello, world!');
+ console.log(tokens); // [123, 456, 789]
+
+ // Count tokens before sending to LLM
+ const prompt = 'Long text...';
+ const { tokens: promptTokens } = await client.tokenize(prompt);
+ if (promptTokens.length > 4000) {
+   console.log('Prompt too long, truncating...');
+ }
+
+ // Detokenize back to text
+ const { content } = await client.detokenize([123, 456, 789]);
+ console.log(content); // 'Hello, world!'
+ ```
+
+ #### Text Embeddings
+
+ ```typescript
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ // Generate embeddings for semantic search
+ const { embedding } = await client.createEmbedding('Search query text');
+ console.log(embedding.length); // e.g., 768 dimensions
+
+ // With images (for multimodal models)
+ const { embedding: multimodalEmbed } = await client.createEmbedding(
+   'Describe this image',
+   'base64_image_data_here'
+ );
+ ```
+
+ #### Code Infilling
+
+ Perfect for code completion in IDEs:
+
+ ```typescript
+ const client = new LlamaCppServerClient('http://localhost:8080');
+
+ const result = await client.infill(
+   'def calculate_fibonacci(n):\n ', // Prefix (before cursor)
+   '\n return result' // Suffix (after cursor)
+ );
+
+ console.log(result.content);
+ // Output: "if n <= 1:\n return n\n result = calculate_fibonacci(n-1) + calculate_fibonacci(n-2)"
+ ```
+
+ ### Error Handling
+
+ ```typescript
+ const response = await service.sendMessage({
+   providerId: 'llamacpp',
+   modelId: 'my-model',
+   messages: [{ role: 'user', content: 'Hello' }]
+ });
+
+ if (response.object === 'error') {
+   switch (response.error.code) {
+     case 'NETWORK_ERROR':
+       console.error('Server not running or unreachable');
+       break;
+     case 'PROVIDER_ERROR':
+       console.error('Server error:', response.error.message);
+       break;
+     default:
+       console.error('Unknown error:', response.error);
+   }
+ }
+ ```
+
+ ### Best Practices
+
+ 1. **Model Naming**: Use descriptive model IDs (e.g., `llama-3-8b-instruct`) since llama.cpp accepts any name
+ 2. **Context Size**: Set appropriate context (`-c` flag) when starting the server
+ 3. **Parallel Requests**: Configure slots (`-np`) based on your hardware
+ 4. **Health Monitoring**: Enable `checkHealth` for production to detect server issues early
+ 5. **Resource Management**: Monitor memory usage; large models need significant RAM
+
+ ### Troubleshooting
+
+ **Server not responding:**
+ ```bash
+ # Check if server is running
+ curl http://localhost:8080/health
+
+ # Should return: {"status":"ok"}
+ ```
+
+ **Model loading errors:**
+ ```bash
+ # Increase memory or reduce context size
+ llama-server -m model.gguf --port 8080 -c 2048
+ ```
+
+ **Slow responses:**
+ ```bash
+ # Use quantized models (smaller but faster)
+ # e.g., Q4_K_M, Q5_K_M instead of F16
+
+ # Increase threads
+ llama-server -m model.gguf --threads 16
+ ```
+
  ## Using with Electron
 
  `genai-lite` is designed to work seamlessly within an Electron application's main process, especially when paired with a secure storage solution like `genai-key-storage-lite`.
@@ -709,14 +1056,14 @@
  genai-lite is written in TypeScript and provides comprehensive type definitions:
 
  ```typescript
- import type {
+ import type {
    LLMChatRequest,
    LLMChatRequestWithPreset,
    LLMResponse,
    LLMFailureResponse,
    LLMSettings,
    LLMReasoningSettings,
-   LLMThinkingExtractionSettings,
+   LLMThinkingTagFallbackSettings,
    ApiKeyProvider,
    ModelPreset,
    LLMServiceOptions,
@@ -725,6 +1072,26 @@ import type {
    CreateMessagesResult,
    TemplateMetadata
  } from 'genai-lite';
+
+ // llama.cpp integration types and classes
+ import {
+   LlamaCppClientAdapter,
+   LlamaCppServerClient,
+   createFallbackModelInfo
+ } from 'genai-lite';
+
+ import type {
+   LlamaCppClientConfig,
+   LlamaCppHealthResponse,
+   LlamaCppTokenizeResponse,
+   LlamaCppDetokenizeResponse,
+   LlamaCppEmbeddingResponse,
+   LlamaCppInfillResponse,
+   LlamaCppPropsResponse,
+   LlamaCppMetricsResponse,
+   LlamaCppSlot,
+   LlamaCppSlotsResponse
+ } from 'genai-lite';
  ```
 
  ## Utilities
@@ -968,24 +1335,23 @@ const { messages } = await llmService.createMessages({
    presetId: 'openai-gpt-4.1-default' // Optional: adds model context
  });
 
- // Advanced: Leverage model context for adaptive prompts
+ // Advanced: Adaptive prompts based on model capabilities
  const { messages, modelContext } = await llmService.createMessages({
    template: `
      <SYSTEM>
-     You are a {{ thinking_enabled ? 'analytical problem solver' : 'quick helper' }}.
-     {{ model_id.includes('claude') ? 'Use your advanced reasoning capabilities.' : '' }}
+     You are a problem-solving assistant.
+     {{ requires_tags_for_thinking ? ' For complex problems, write your reasoning in <thinking> tags before answering.' : '' }}
      </SYSTEM>
-     <USER>
-     {{ thinking_enabled ? 'Please solve this step-by-step:' : 'Please answer:' }}
-     {{ question }}
-     </USER>
+     <USER>{{ question }}</USER>
    `,
+   // Note: requires_tags_for_thinking is the negation of native_reasoning_active - only models without active native reasoning get the tag instruction
    variables: { question: 'What causes the seasons on Earth?' },
    presetId: 'anthropic-claude-3-7-sonnet-20250219-thinking'
  });
 
  console.log('Model context:', modelContext);
- // Output: { thinking_enabled: true, thinking_available: true, model_id: 'claude-3-7-sonnet-20250219', ... }
+ // Output: { native_reasoning_active: true, native_reasoning_capable: true, model_id: 'claude-3-7-sonnet-20250219', ... }
+ // Note: With a reasoning model, the system prompt won't include thinking tag instructions
  ```
 
  **Low-Level Utilities:**
@@ -1106,6 +1472,10 @@ These utilities enable:
  - **Template Reusability**: Define templates once, use with different variables
  - **Type Safety**: Full TypeScript support with LLMMessage types
 
+ ## Examples
+
+ See the **[chat-demo](examples/chat-demo)** application for a complete working example that demonstrates all library features in a production-ready React + Express application.
+
  ## Contributing
 
  Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
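The README hunks above document the headline breaking change of 0.4.x: `thinkingExtraction` becomes `thinkingTagFallback`, its `tag` option becomes `tagName`, and the `onMissing` modes collapse into a single smart `enforce` boolean. A minimal before/after migration sketch, using the provider and model IDs from the Quick Start section shown above (the prompt text is only a placeholder):

```typescript
import { LLMService, fromEnvironment } from 'genai-lite';

const llmService = new LLMService(fromEnvironment);

// 0.3.x settings shape (removed in this release):
//   settings: { thinkingExtraction: { enabled: true, tag: 'scratchpad', onMissing: 'auto' } }

// 0.4.x settings shape, as documented in the README diff above:
const response = await llmService.sendMessage({
  providerId: 'openai',
  modelId: 'gpt-4.1-mini',
  messages: [
    { role: 'user', content: 'Think in <scratchpad> tags, then answer: what is 15% of 240?' }
  ],
  settings: {
    thinkingTagFallback: {
      enabled: true,         // extraction is still opt-in
      tagName: 'scratchpad', // was `tag`
      enforce: true          // replaces the 'ignore' | 'warn' | 'error' | 'auto' modes
    }
  }
});

if (response.object === 'chat.completion') {
  console.log(response.choices[0].reasoning);       // extracted <scratchpad> content, when present
  console.log(response.choices[0].message.content); // the answer itself
} else {
  console.error(response.error.message);            // enforcement errors keep the raw reply in partialResponse
}
```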
package/dist/index.d.ts CHANGED
@@ -5,7 +5,12 @@ export type { ModelPreset } from "./types/presets";
  export * from "./llm/types";
  export * from "./llm/clients/types";
  export { fromEnvironment } from "./providers/fromEnvironment";
+ export { LlamaCppClientAdapter } from "./llm/clients/LlamaCppClientAdapter";
+ export { LlamaCppServerClient } from "./llm/clients/LlamaCppServerClient";
+ export type { LlamaCppClientConfig, } from "./llm/clients/LlamaCppClientAdapter";
+ export type { LlamaCppHealthResponse, LlamaCppTokenizeResponse, LlamaCppDetokenizeResponse, LlamaCppEmbeddingResponse, LlamaCppInfillResponse, LlamaCppPropsResponse, LlamaCppMetricsResponse, LlamaCppSlot, LlamaCppSlotsResponse, } from "./llm/clients/LlamaCppServerClient";
  export { renderTemplate } from "./prompting/template";
  export { countTokens, getSmartPreview, extractRandomVariables } from "./prompting/content";
  export { parseStructuredContent, parseRoleTags, extractInitialTaggedContent, parseTemplateWithMetadata } from "./prompting/parser";
  export type { TemplateMetadata } from "./prompting/parser";
+ export { createFallbackModelInfo } from "./llm/config";
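The declaration changes above define the new public surface for the llama.cpp integration. A small sketch of importing and wiring these exports together, assuming a llama.cpp server at `http://localhost:8080` as in the README and assuming each `LlamaCpp*Response` type pairs with the like-named `LlamaCppServerClient` method (that pairing is not spelled out in this diff); `createFallbackModelInfo` is only imported here because its signature is not shown in this release:

```typescript
import {
  LLMService,
  LlamaCppClientAdapter,
  LlamaCppServerClient,
  createFallbackModelInfo, // newly re-exported from ./llm/config; signature not shown in this diff
} from 'genai-lite';
import type {
  LlamaCppClientConfig,
  LlamaCppHealthResponse,
  LlamaCppTokenizeResponse,
} from 'genai-lite';

// Typed adapter config, mirroring the README's "Multiple Servers" example
const config: LlamaCppClientConfig = { baseURL: 'http://localhost:8080' };

const service = new LLMService(async () => 'not-needed'); // llama.cpp ignores the key
service.registerAdapter('llamacpp', new LlamaCppClientAdapter(config));

// Direct access to the non-LLM endpoints documented in the README above
const server = new LlamaCppServerClient('http://localhost:8080');
const health: LlamaCppHealthResponse = await server.getHealth();
const tokenized: LlamaCppTokenizeResponse = await server.tokenize('Hello, world!');
console.log(health.status, tokenized.tokens.length);
```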
package/dist/index.js CHANGED
@@ -14,7 +14,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
  for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.parseTemplateWithMetadata = exports.extractInitialTaggedContent = exports.parseRoleTags = exports.parseStructuredContent = exports.extractRandomVariables = exports.getSmartPreview = exports.countTokens = exports.renderTemplate = exports.fromEnvironment = exports.LLMService = void 0;
+ exports.createFallbackModelInfo = exports.parseTemplateWithMetadata = exports.extractInitialTaggedContent = exports.parseRoleTags = exports.parseStructuredContent = exports.extractRandomVariables = exports.getSmartPreview = exports.countTokens = exports.renderTemplate = exports.LlamaCppServerClient = exports.LlamaCppClientAdapter = exports.fromEnvironment = exports.LLMService = void 0;
  // --- LLM Service ---
  var LLMService_1 = require("./llm/LLMService");
  Object.defineProperty(exports, "LLMService", { enumerable: true, get: function () { return LLMService_1.LLMService; } });
@@ -25,6 +25,11 @@ __exportStar(require("./llm/clients/types"), exports);
  // --- API Key Providers ---
  var fromEnvironment_1 = require("./providers/fromEnvironment");
  Object.defineProperty(exports, "fromEnvironment", { enumerable: true, get: function () { return fromEnvironment_1.fromEnvironment; } });
+ // --- llama.cpp Integration ---
+ var LlamaCppClientAdapter_1 = require("./llm/clients/LlamaCppClientAdapter");
+ Object.defineProperty(exports, "LlamaCppClientAdapter", { enumerable: true, get: function () { return LlamaCppClientAdapter_1.LlamaCppClientAdapter; } });
+ var LlamaCppServerClient_1 = require("./llm/clients/LlamaCppServerClient");
+ Object.defineProperty(exports, "LlamaCppServerClient", { enumerable: true, get: function () { return LlamaCppServerClient_1.LlamaCppServerClient; } });
  // --- Utilities ---
  var template_1 = require("./prompting/template");
  Object.defineProperty(exports, "renderTemplate", { enumerable: true, get: function () { return template_1.renderTemplate; } });
@@ -37,3 +42,5 @@ Object.defineProperty(exports, "parseStructuredContent", { enumerable: true, get
  Object.defineProperty(exports, "parseRoleTags", { enumerable: true, get: function () { return parser_1.parseRoleTags; } });
  Object.defineProperty(exports, "extractInitialTaggedContent", { enumerable: true, get: function () { return parser_1.extractInitialTaggedContent; } });
  Object.defineProperty(exports, "parseTemplateWithMetadata", { enumerable: true, get: function () { return parser_1.parseTemplateWithMetadata; } });
+ var config_1 = require("./llm/config");
+ Object.defineProperty(exports, "createFallbackModelInfo", { enumerable: true, get: function () { return config_1.createFallbackModelInfo; } });
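The compiled entry point above registers the same llama.cpp exports for runtime use. As a closing sketch of how the pieces documented in this release compose, here is a combination of the README's health-checking and error-handling guidance; the model ID is an arbitrary placeholder, since the README notes llama.cpp does not validate it:

```typescript
import { LLMService, LlamaCppClientAdapter } from 'genai-lite';

const service = new LLMService(async () => 'not-needed');

// Health-checked adapter, per the "Health Checking" section of the README above
service.registerAdapter(
  'llamacpp',
  new LlamaCppClientAdapter({ baseURL: 'http://localhost:8080', checkHealth: true })
);

const response = await service.sendMessage({
  providerId: 'llamacpp',
  modelId: 'my-local-model', // placeholder; llama.cpp accepts any name
  messages: [{ role: 'user', content: 'Hello!' }]
});

// Error codes follow the README's "Error Handling" section
if (response.object === 'error') {
  if (response.error.code === 'NETWORK_ERROR') {
    console.error('llama.cpp server not running or unreachable');
  } else {
    console.error(`${response.error.code}: ${response.error.message}`);
  }
} else {
  console.log(response.choices[0].message.content);
}
```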