llm_conductor 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +13 -1
- data/README.md +172 -2
- data/VISION_USAGE.md +278 -0
- data/examples/openrouter_vision_usage.rb +108 -0
- data/examples/zai_usage.rb +163 -0
- data/lib/llm_conductor/client_factory.rb +4 -1
- data/lib/llm_conductor/clients/openrouter_client.rb +112 -7
- data/lib/llm_conductor/clients/zai_client.rb +153 -0
- data/lib/llm_conductor/configuration.rb +17 -0
- data/lib/llm_conductor/prompt_manager.rb +1 -3
- data/lib/llm_conductor/version.rb +1 -1
- data/lib/llm_conductor.rb +5 -3
- metadata +6 -2
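
At a glance, the functional core of this release is the new Z.ai (Zhipu AI) client and vision-style prompts for OpenRouter and GLM models. The following is a minimal sketch of the new surface, assembled from the README and example diffs shown below (the image URL is a placeholder, not from the package):

```ruby
require 'llm_conductor'

LlmConductor.configure do |config|
  config.zai(api_key: ENV['ZAI_API_KEY'])
end

response = LlmConductor.generate(
  model: 'glm-4.5v',
  vendor: :zai,
  prompt: { text: 'What is in this image?', images: 'https://example.com/image.jpg' }
)
puts response.output
```
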
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c6ed179bb9142839bcc6feab8d06d61c27ff8279406bc7839f6d09ba14cb573f
+  data.tar.gz: a8ca32fecd9ac81326f7cefcf482f1b6a110b78ca2168c1c8ccbde5e034becb3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 581da83914c51a3966010d03491c3f57be4ed393bb572f2fdc9d0205f8680f4891f2b058ecf7642ea7bf26bea452a976946b6198d0419afb2e771de3bc112aea
+  data.tar.gz: 00eb70033cb739b7236b759a30219eb5eb6b72db7bba6c7ee519b98cf186e799cbf4f8696acf237d19a8fbfcca97dd9a189ce4f3b4f8f3d8a7d9ff1729d7eb86

data/.rubocop.yml
CHANGED
@@ -29,10 +29,15 @@ Style/HashSyntax:
 Lint/ConstantDefinitionInBlock:
   Enabled: false
 
+Metrics/ClassLength:
+  Max: 120
+
 Metrics/MethodLength:
   Max: 15
   Exclude:
     - 'lib/llm_conductor/prompts.rb'
+    - 'lib/llm_conductor/clients/openrouter_client.rb'
+    - 'lib/llm_conductor/clients/zai_client.rb'
 
 RSpec/ExampleLength:
   Enabled: false
@@ -89,17 +94,24 @@ Metrics/BlockLength:
 Metrics/AbcSize:
   Exclude:
     - 'lib/llm_conductor/prompts.rb'
+    - 'lib/llm_conductor/clients/openrouter_client.rb'
+    - 'lib/llm_conductor/clients/zai_client.rb'
 
 Metrics/CyclomaticComplexity:
   Exclude:
+    - 'lib/llm_conductor.rb'
     - 'lib/llm_conductor/prompts.rb'
+    - 'lib/llm_conductor/clients/openrouter_client.rb'
+    - 'lib/llm_conductor/clients/zai_client.rb'
 
 Metrics/PerceivedComplexity:
   Exclude:
     - 'lib/llm_conductor/prompts.rb'
+    - 'lib/llm_conductor/clients/openrouter_client.rb'
+    - 'lib/llm_conductor/clients/zai_client.rb'
 
 Layout/LineLength:
-  Max:
+  Max: 125
 
 # Performance cops (from .rubocop_todo.yml)
 Performance/RedundantEqualityComparisonBlock:

data/README.md
CHANGED
@@ -1,11 +1,12 @@
 # LLM Conductor
 
-A powerful Ruby gem from [Ekohe](https://ekohe.com) for orchestrating multiple Language Model providers with a unified, modern interface. LLM Conductor provides seamless integration with OpenAI GPT, Anthropic Claude, Google Gemini, Groq, and
+A powerful Ruby gem from [Ekohe](https://ekohe.com) for orchestrating multiple Language Model providers with a unified, modern interface. LLM Conductor provides seamless integration with OpenAI GPT, Anthropic Claude, Google Gemini, Groq, Ollama, OpenRouter, and Z.ai (Zhipu AI) with advanced prompt management, data building patterns, vision/multimodal support, and comprehensive response handling.
 
 ## Features
 
-🚀 **Multi-Provider Support** - OpenAI GPT, Anthropic Claude, Google Gemini, Groq, and
+🚀 **Multi-Provider Support** - OpenAI GPT, Anthropic Claude, Google Gemini, Groq, Ollama, OpenRouter, and Z.ai with automatic vendor detection
 🎯 **Unified Modern API** - Simple `LlmConductor.generate()` interface with rich Response objects
+🖼️ **Vision/Multimodal Support** - Send images alongside text prompts for vision-enabled models (OpenRouter, Z.ai GLM-4.5V)
 📝 **Advanced Prompt Management** - Registrable prompt classes with inheritance and templating
 🏗️ **Data Builder Pattern** - Structured data preparation for complex LLM inputs
 ⚡ **Smart Configuration** - Rails-style configuration with environment variable support
@@ -114,6 +115,16 @@ LlmConductor.configure do |config|
     base_url: ENV['OLLAMA_ADDRESS'] || 'http://localhost:11434'
   )
 
+  config.openrouter(
+    api_key: ENV['OPENROUTER_API_KEY'],
+    uri_base: 'https://openrouter.ai/api/v1' # Optional, this is the default
+  )
+
+  config.zai(
+    api_key: ENV['ZAI_API_KEY'],
+    uri_base: 'https://api.z.ai/api/paas/v4' # Optional, this is the default
+  )
+
   # Optional: Configure custom logger
   config.logger = Logger.new($stdout) # Log to stdout
   config.logger = Logger.new('log/llm_conductor.log') # Log to file
@@ -153,6 +164,8 @@ The gem automatically detects these environment variables:
 - `GEMINI_API_KEY` - Google Gemini API key
 - `GROQ_API_KEY` - Groq API key
 - `OLLAMA_ADDRESS` - Ollama server address
+- `OPENROUTER_API_KEY` - OpenRouter API key
+- `ZAI_API_KEY` - Z.ai (Zhipu AI) API key
 
 ## Supported Providers & Models
 
@@ -223,6 +236,160 @@ response = LlmConductor.generate(
 )
 ```
 
+### OpenRouter (Access to Multiple Providers)
+OpenRouter provides unified access to various LLM providers with automatic routing. It also supports vision/multimodal models with automatic retry logic for handling intermittent availability issues.
+
+**Vision-capable models:**
+- `nvidia/nemotron-nano-12b-v2-vl:free` - **FREE** 12B vision model (may need retries)
+- `openai/gpt-4o-mini` - Fast and reliable
+- `google/gemini-flash-1.5` - Fast vision processing
+- `anthropic/claude-3.5-sonnet` - High quality analysis
+- `openai/gpt-4o` - Best quality (higher cost)
+
+**Note:** Free-tier models may experience intermittent 502 errors. The client includes automatic retry logic with exponential backoff (up to 5 retries) to handle these transient failures.
+
+```ruby
+# Text-only request
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+  vendor: :openrouter,
+  prompt: 'Your prompt here'
+)
+
+# Vision/multimodal request with single image
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+  vendor: :openrouter,
+  prompt: {
+    text: 'What is in this image?',
+    images: 'https://example.com/image.jpg'
+  }
+)
+
+# Vision request with multiple images
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+  vendor: :openrouter,
+  prompt: {
+    text: 'Compare these images',
+    images: [
+      'https://example.com/image1.jpg',
+      'https://example.com/image2.jpg'
+    ]
+  }
+)
+
+# Vision request with detail level
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+  vendor: :openrouter,
+  prompt: {
+    text: 'Describe this image in detail',
+    images: [
+      { url: 'https://example.com/image.jpg', detail: 'high' }
+    ]
+  }
+)
+
+# Advanced: Raw array format (OpenAI-compatible)
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+  vendor: :openrouter,
+  prompt: [
+    { type: 'text', text: 'What is in this image?' },
+    { type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } }
+  ]
+)
+```
+
+**Reliability:** The OpenRouter client includes intelligent retry logic:
+- Automatically retries on 502 errors (up to 5 attempts)
+- Exponential backoff: 2s, 4s, 8s, 16s, 32s
+- Transparent to your code - works seamlessly
+- Enable logging to see retry attempts:
+
+```ruby
+LlmConductor.configure do |config|
+  config.logger = Logger.new($stdout)
+  config.logger.level = Logger::INFO
+end
+```
+
+### Z.ai (Zhipu AI) - GLM Models with Vision Support
+Z.ai provides access to GLM (General Language Model) series including the powerful GLM-4.5V multimodal model with 64K context window and vision capabilities.
+
+**Text models:**
+- `glm-4-plus` - Enhanced text-only model
+- `glm-4` - Standard GLM-4 model
+
+**Vision-capable models:**
+- `glm-4.5v` - Latest multimodal model with 64K context ✅ **RECOMMENDED**
+- `glm-4v` - Previous generation vision model
+
+```ruby
+# Text-only request with GLM-4-plus
+response = LlmConductor.generate(
+  model: 'glm-4-plus',
+  vendor: :zai,
+  prompt: 'Explain quantum computing in simple terms'
+)
+
+# Vision request with GLM-4.5V - single image
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'What is in this image?',
+    images: 'https://example.com/image.jpg'
+  }
+)
+
+# Vision request with multiple images
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Compare these images and identify differences',
+    images: [
+      'https://example.com/image1.jpg',
+      'https://example.com/image2.jpg'
+    ]
+  }
+)
+
+# Vision request with detail level
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Analyze this document in detail',
+    images: [
+      { url: 'https://example.com/document.jpg', detail: 'high' }
+    ]
+  }
+)
+
+# Base64 encoded local images
+require 'base64'
+image_data = Base64.strict_encode64(File.read('path/to/image.jpg'))
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'What is in this image?',
+    images: "data:image/jpeg;base64,#{image_data}"
+  }
+)
+```
+
+**GLM-4.5V Features:**
+- 64K token context window
+- Multimodal understanding (text + images)
+- Document understanding and OCR
+- Image reasoning and analysis
+- Base64 image support for local files
+- OpenAI-compatible API format
+
 ### Vendor Detection
 
 The gem automatically detects the appropriate provider based on model names:
@@ -230,6 +397,7 @@ The gem automatically detects the appropriate provider based on model names:
 - **OpenAI**: Models starting with `gpt-` (e.g., `gpt-4`, `gpt-3.5-turbo`)
 - **Anthropic**: Models starting with `claude-` (e.g., `claude-3-5-sonnet-20241022`)
 - **Google Gemini**: Models starting with `gemini-` (e.g., `gemini-2.5-flash`, `gemini-2.0-flash`)
+- **Z.ai**: Models starting with `glm-` (e.g., `glm-4.5v`, `glm-4-plus`, `glm-4v`)
 - **Groq**: Models starting with `llama`, `mixtral`, `gemma`, or `qwen` (e.g., `llama-3.1-70b-versatile`, `mixtral-8x7b-32768`, `gemma-7b-it`, `qwen-2.5-72b-instruct`)
 - **Ollama**: All other models (e.g., `llama3.2`, `mistral`, `codellama`)
 
@@ -483,6 +651,8 @@ Check the `/examples` directory for comprehensive usage examples:
 - `rag_usage.rb` - RAG implementation examples
 - `gemini_usage.rb` - Google Gemini integration
 - `groq_usage.rb` - Groq integration with various models
+- `openrouter_vision_usage.rb` - OpenRouter vision/multimodal examples
+- `zai_usage.rb` - Z.ai GLM-4.5V vision and text examples
 
 ## Development
 

data/VISION_USAGE.md
ADDED
@@ -0,0 +1,278 @@
+# Vision/Multimodal Usage Guide
+
+This guide explains how to use vision/multimodal capabilities with the OpenRouter and Z.ai clients in LLM Conductor.
+
+## Quick Start
+
+### Using OpenRouter
+
+```ruby
+require 'llm_conductor'
+
+# Configure
+LlmConductor.configure do |config|
+  config.openrouter(api_key: ENV['OPENROUTER_API_KEY'])
+end
+
+# Analyze an image
+response = LlmConductor.generate(
+  model: 'openai/gpt-4o-mini',
+  vendor: :openrouter,
+  prompt: {
+    text: 'What is in this image?',
+    images: 'https://example.com/image.jpg'
+  }
+)
+
+puts response.output
+```
+
+### Using Z.ai (Zhipu AI)
+
+```ruby
+require 'llm_conductor'
+
+# Configure
+LlmConductor.configure do |config|
+  config.zai(api_key: ENV['ZAI_API_KEY'])
+end
+
+# Analyze an image with GLM-4.5V
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'What is in this image?',
+    images: 'https://example.com/image.jpg'
+  }
+)
+
+puts response.output
+```
+
+## Recommended Models
+
+### OpenRouter Models
+
+For vision tasks via OpenRouter, these models work reliably:
+
+- **`openai/gpt-4o-mini`** - Fast, reliable, good balance of cost/quality ✅
+- **`google/gemini-flash-1.5`** - Fast vision processing
+- **`anthropic/claude-3.5-sonnet`** - High quality analysis
+- **`openai/gpt-4o`** - Best quality (higher cost)
+
+### Z.ai Models (Zhipu AI)
+
+For vision tasks via Z.ai, these GLM models are recommended:
+
+- **`glm-4.5v`** - GLM-4.5V multimodal model (64K context window) ✅
+- **`glm-4-plus`** - Text-only model with enhanced capabilities
+- **`glm-4v`** - Previous generation vision model
+
+## Usage Formats
+
+### 1. Single Image (Simple Format)
+
+```ruby
+response = LlmConductor.generate(
+  model: 'openai/gpt-4o-mini',
+  vendor: :openrouter,
+  prompt: {
+    text: 'Describe this image',
+    images: 'https://example.com/image.jpg'
+  }
+)
+```
+
+### 2. Multiple Images
+
+```ruby
+response = LlmConductor.generate(
+  model: 'openai/gpt-4o-mini',
+  vendor: :openrouter,
+  prompt: {
+    text: 'Compare these images',
+    images: [
+      'https://example.com/image1.jpg',
+      'https://example.com/image2.jpg',
+      'https://example.com/image3.jpg'
+    ]
+  }
+)
+```
+
+### 3. Image with Detail Level
+
+For high-resolution images, specify the detail level:
+
+```ruby
+response = LlmConductor.generate(
+  model: 'openai/gpt-4o-mini',
+  vendor: :openrouter,
+  prompt: {
+    text: 'Analyze this image in detail',
+    images: [
+      { url: 'https://example.com/hires-image.jpg', detail: 'high' }
+    ]
+  }
+)
+```
+
+Detail levels:
+- `'high'` - Better for detailed analysis (uses more tokens)
+- `'low'` - Faster, cheaper (default if not specified)
+- `'auto'` - Let the model decide
+
+### 4. Raw Format (Advanced)
+
+For maximum control, use the OpenAI-compatible array format:
+
+```ruby
+response = LlmConductor.generate(
+  model: 'openai/gpt-4o-mini',
+  vendor: :openrouter,
+  prompt: [
+    { type: 'text', text: 'What is in this image?' },
+    { type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } },
+    { type: 'text', text: 'Describe it in detail.' }
+  ]
+)
+```
+
+## Text-Only Requests (Backward Compatible)
+
+The client still supports regular text-only requests:
+
+```ruby
+response = LlmConductor.generate(
+  model: 'openai/gpt-4o-mini',
+  vendor: :openrouter,
+  prompt: 'What is the capital of France?'
+)
+```
+
+## Image URL Requirements
+
+- Images must be publicly accessible URLs
+- Supported formats: JPEG, PNG, GIF, WebP
+- Maximum file size depends on the model
+- Use HTTPS URLs when possible
+
+## Error Handling
+
+```ruby
+response = LlmConductor.generate(
+  model: 'openai/gpt-4o-mini',
+  vendor: :openrouter,
+  prompt: {
+    text: 'Analyze this',
+    images: 'https://example.com/image.jpg'
+  }
+)
+
+if response.success?
+  puts response.output
+else
+  puts "Error: #{response.metadata[:error]}"
+end
+```
+
+## Testing in Development
+
+### Interactive Console
+
+```bash
+./bin/console
+```
+
+Then:
+
+```ruby
+LlmConductor.configure do |config|
+  config.openrouter(api_key: 'your-key')
+end
+
+response = LlmConductor.generate(
+  model: 'openai/gpt-4o-mini',
+  vendor: :openrouter,
+  prompt: {
+    text: 'What is this?',
+    images: 'https://example.com/image.jpg'
+  }
+)
+```
+
+### Run Examples
+
+For OpenRouter:
+```bash
+export OPENROUTER_API_KEY='your-key'
+ruby examples/openrouter_vision_usage.rb
+```
+
+For Z.ai:
+```bash
+export ZAI_API_KEY='your-key'
+ruby examples/zai_usage.rb
+```
+
+## Token Counting
+
+Token counting for multimodal requests counts only the text portion. Image tokens vary by:
+- Image size
+- Detail level specified
+- Model being used
+
+The gem provides an approximation based on text tokens. For precise billing, check the OpenRouter dashboard.
+
+## Common Issues
+
+### 502 Server Error
+
+If you get a 502 error:
+- The model might be unavailable
+- Try a different model (e.g., switch to `openai/gpt-4o-mini`)
+- Free tier models may be overloaded
+
+### "No implicit conversion of Hash into String"
+
+This was fixed in the current version. Make sure you're using the latest version of the gem.
+
+### Image Not Loading
+
+- Verify the URL is publicly accessible
+- Check that the image format is supported
+- Try a smaller image size
+
+## Cost Considerations
+
+Vision models are more expensive than text-only models. Costs vary by:
+
+- **Model choice**: GPT-4o > GPT-4o-mini > Gemini Flash
+- **Detail level**: `high` uses more tokens than `low`
+- **Image count**: Each image adds to the cost
+- **Image size**: Larger images may use more tokens
+
+For development, use:
+- `openai/gpt-4o-mini` for cost-effective testing
+- `detail: 'low'` for quick analysis
+- Single images when possible
+
+For production:
+- Use `openai/gpt-4o` for best quality
+- Use `detail: 'high'` when needed
+- Monitor costs via OpenRouter dashboard
+
+## Examples
+
+- `examples/openrouter_vision_usage.rb` - Complete OpenRouter vision examples
+- `examples/zai_usage.rb` - Complete Z.ai GLM-4.5V examples including vision and text
+
+## Further Reading
+
+- [OpenRouter Documentation](https://openrouter.ai/docs)
+- [OpenAI Vision API Reference](https://platform.openai.com/docs/guides/vision)
+- [Anthropic Claude Vision](https://docs.anthropic.com/claude/docs/vision)
+- [Z.ai API Platform](https://api.z.ai/)
+- [GLM-4.5V Documentation](https://bigmodel.cn/)
+

data/examples/openrouter_vision_usage.rb
ADDED

@@ -0,0 +1,108 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Example of OpenRouter vision/multimodal usage
+require_relative '../lib/llm_conductor'
+
+# Configure OpenRouter
+LlmConductor.configure do |config|
+  config.openrouter(
+    api_key: ENV['OPENROUTER_API_KEY']
+  )
+end
+
+# Example 1: Simple text-only request (backward compatible)
+puts '=== Example 1: Text-only request ==='
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free', # Free vision-capable model
+  vendor: :openrouter,
+  prompt: 'What is the capital of France?'
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 2: Vision request with a single image
+puts '=== Example 2: Single image analysis ==='
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+  vendor: :openrouter,
+  prompt: {
+    text: 'What is in this image?',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 3: Vision request with multiple images
+puts '=== Example 3: Multiple images comparison ==='
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+  vendor: :openrouter,
+  prompt: {
+    text: 'Compare these two images and describe the differences.',
+    images: [
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Placeholder_view_vector.svg/681px-Placeholder_view_vector.svg.png'
+    ]
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 4: Image with detail level specification
+puts '=== Example 4: Image with detail level ==='
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+  vendor: :openrouter,
+  prompt: {
+    text: 'Describe this image in detail.',
+    images: [
+      {
+        url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
+        detail: 'high'
+      }
+    ]
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 5: Using raw array format (advanced)
+puts '=== Example 5: Raw array format ==='
+response = LlmConductor.generate(
+  model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+  vendor: :openrouter,
+  prompt: [
+    { type: 'text', text: 'What is in this image?' },
+    {
+      type: 'image_url',
+      image_url: {
+        url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+      }
+    }
+  ]
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 6: Error handling
+puts '=== Example 6: Error handling ==='
+begin
+  response = LlmConductor.generate(
+    model: 'nvidia/nemotron-nano-12b-v2-vl:free',
+    vendor: :openrouter,
+    prompt: {
+      text: 'Analyze this image',
+      images: 'invalid-url'
+    }
+  )
+
+  if response.success?
+    puts response.output
+  else
+    puts "Error: #{response.metadata[:error]}"
+  end
+rescue StandardError => e
+  puts "Exception: #{e.message}"
+end

data/examples/zai_usage.rb
ADDED

@@ -0,0 +1,163 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Example of Z.ai GLM model usage including multimodal/vision capabilities
+require_relative '../lib/llm_conductor'
+
+# Configure Z.ai
+LlmConductor.configure do |config|
+  config.zai(
+    api_key: ENV['ZAI_API_KEY']
+  )
+end
+
+# Example 1: Simple text-only request with GLM-4-plus
+puts '=== Example 1: Text-only request with GLM-4-plus ==='
+response = LlmConductor.generate(
+  model: 'glm-4-plus',
+  vendor: :zai,
+  prompt: 'What is the capital of France? Please answer in one sentence.'
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 2: Text request with GLM-4.5V (vision model, text-only mode)
+puts '=== Example 2: Text-only request with GLM-4.5V ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: 'Explain the concept of machine learning in simple terms.'
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 3: Vision request with a single image
+puts '=== Example 3: Single image analysis with GLM-4.5V ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'What do you see in this image? Please describe it in detail.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 4: Vision request with multiple images
+puts '=== Example 4: Multiple images comparison with GLM-4.5V ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Compare these two images and describe the differences you observe.',
+    images: [
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Placeholder_view_vector.svg/681px-Placeholder_view_vector.svg.png'
+    ]
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 5: Image with detail level specification
+puts '=== Example 5: Image with detail level ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Describe this image in detail, including colors, objects, and atmosphere.',
+    images: [
+      {
+        url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
+        detail: 'high'
+      }
+    ]
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 6: Using raw array format (advanced)
+puts '=== Example 6: Raw array format ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: [
+    { type: 'text', text: 'What objects can you identify in this image?' },
+    {
+      type: 'image_url',
+      image_url: {
+        url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+      }
+    }
+  ]
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 7: Base64 encoded image (for local images)
+puts '=== Example 7: Using base64 encoded image ==='
+# NOTE: In real usage, you would read and encode a local file
+# require 'base64'
+# image_data = Base64.strict_encode64(File.read('path/to/image.jpg'))
+# image_url = "data:image/jpeg;base64,#{image_data}"
+
+# For this example, we'll use a URL
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Analyze this image and extract any text you can see.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 8: Error handling
+puts '=== Example 8: Error handling ==='
+begin
+  response = LlmConductor.generate(
+    model: 'glm-4.5v',
+    vendor: :zai,
+    prompt: {
+      text: 'Analyze this image',
+      images: 'invalid-url'
+    }
+  )
+
+  if response.success?
+    puts response.output
+  else
+    puts "Error: #{response.metadata[:error]}"
+  end
+rescue StandardError => e
+  puts "Exception: #{e.message}"
+end
+
+# Example 9: Document understanding (OCR)
+puts "\n=== Example 9: Document understanding ==="
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Please read any text visible in this image and transcribe it.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 10: Complex reasoning with image
+puts '=== Example 10: Complex reasoning with image ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Based on this image, what time of day do you think it is? Explain your reasoning.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"

data/lib/llm_conductor/client_factory.rb
CHANGED

@@ -19,7 +19,8 @@ module LlmConductor
         ollama: Clients::OllamaClient,
         gemini: Clients::GeminiClient,
         google: Clients::GeminiClient,
-        groq: Clients::GroqClient
+        groq: Clients::GroqClient,
+        zai: Clients::ZaiClient
       }
 
       client_classes.fetch(vendor) do
@@ -35,6 +36,8 @@ module LlmConductor
         :openai
       when /^gemini/i
         :gemini
+      when /^glm/i
+        :zai
       when /^(llama|mixtral|gemma|qwen)/i
         :groq
       else

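The factory diff above adds a `/^glm/i` rule, so GLM model names resolve to the new Z.ai client. A minimal sketch of what that routing implies, assuming the top-level `LlmConductor.generate` API shown in the README diff (the second call, which omits `vendor:`, is a hypothetical illustration of the name-based detection added here):

```ruby
# Explicit vendor selection, routed by ClientFactory to Clients::ZaiClient
response = LlmConductor.generate(model: 'glm-4.5v', vendor: :zai, prompt: 'Hello')

# With the new /^glm/i rule, a GLM model name alone should resolve to :zai
response = LlmConductor.generate(model: 'glm-4-plus', prompt: 'Hello')
```
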
data/lib/llm_conductor/clients/openrouter_client.rb
CHANGED

@@ -3,17 +3,122 @@
 module LlmConductor
   module Clients
     # OpenRouter client implementation for accessing various LLM providers through OpenRouter API
+    # Supports both text-only and multimodal (vision) requests
     class OpenrouterClient < BaseClient
       private
 
+      # Override token calculation to handle multimodal content
+      def calculate_tokens(content)
+        case content
+        when String
+          super(content)
+        when Hash
+          # For multimodal content, count tokens only for text part
+          # Note: This is an approximation as images have variable token counts
+          text = content[:text] || content['text'] || ''
+          super(text)
+        when Array
+          # For pre-formatted arrays, extract and count text parts
+          text_parts = content.select { |part| part[:type] == 'text' || part['type'] == 'text' }
+                              .map { |part| part[:text] || part['text'] || '' }
+                              .join(' ')
+          super(text_parts)
+        else
+          super(content.to_s)
+        end
+      end
+
       def generate_content(prompt)
-
-
-
-
-
+        content = format_content(prompt)
+
+        # Retry logic for transient 502 errors (common with free-tier models)
+        # Free-tier vision models can be slow/overloaded, so we use more retries
+        max_retries = 5
+        retry_count = 0
+
+        begin
+          client.chat(
+            parameters: {
+              model:,
+              messages: [{ role: 'user', content: }],
+              provider: { sort: 'throughput' }
+            }
+          ).dig('choices', 0, 'message', 'content')
+        rescue Faraday::ServerError => e
+          retry_count += 1
+
+          # Log retry attempts if logger is configured
+          configuration.logger&.warn(
+            "OpenRouter API error (attempt #{retry_count}/#{max_retries}): #{e.message}"
+          )
+
+          raise unless e.response[:status] == 502 && retry_count < max_retries
+
+          wait_time = 2**retry_count # Exponential backoff: 2, 4, 8, 16, 32 seconds
+          configuration.logger&.info("Retrying in #{wait_time}s...")
+          sleep(wait_time)
+          retry
+        end
+      end
+
+      # Format content based on whether it's a simple string or multimodal content
+      # @param prompt [String, Hash, Array] The prompt content
+      # @return [String, Array] Formatted content for the API
+      def format_content(prompt)
+        case prompt
+        when Hash
+          # Handle hash with text and/or images
+          format_multimodal_hash(prompt)
+        when Array
+          # Already formatted as array of content parts
+          prompt
+        else
+          # Simple string prompt
+          prompt.to_s
+        end
+      end
+
+      # Format a hash containing text and/or images into multimodal content array
+      # @param prompt_hash [Hash] Hash with :text and/or :images keys
+      # @return [Array] Array of content parts for the API
+      def format_multimodal_hash(prompt_hash)
+        content_parts = []
+
+        # Add text part if present
+        if prompt_hash[:text] || prompt_hash['text']
+          text = prompt_hash[:text] || prompt_hash['text']
+          content_parts << { type: 'text', text: }
+        end
+
+        # Add image parts if present
+        images = prompt_hash[:images] || prompt_hash['images'] || []
+        images = [images] unless images.is_a?(Array)
+
+        images.each do |image|
+          content_parts << format_image_part(image)
+        end
+
+        content_parts
+      end
+
+      # Format an image into the appropriate API structure
+      # @param image [String, Hash] Image URL or hash with url/detail keys
+      # @return [Hash] Formatted image part for the API
+      def format_image_part(image)
+        case image
+        when String
+          # Simple URL string
+          { type: 'image_url', image_url: { url: image } }
+        when Hash
+          # Hash with url and optional detail level
+          {
+            type: 'image_url',
+            image_url: {
+              url: image[:url] || image['url'],
+              detail: image[:detail] || image['detail']
+            }.compact
+          }
-
+        end
       end
 
       def client
@@ -21,7 +126,7 @@ module LlmConductor
         config = LlmConductor.configuration.provider_config(:openrouter)
         OpenAI::Client.new(
           access_token: config[:api_key],
-          uri_base: config[:uri_base] || 'https://openrouter.ai/api/'
+          uri_base: config[:uri_base] || 'https://openrouter.ai/api/v1'
         )
       end
     end

data/lib/llm_conductor/clients/zai_client.rb
ADDED

@@ -0,0 +1,153 @@
+# frozen_string_literal: true
+
+module LlmConductor
+  module Clients
+    # Z.ai client implementation for accessing GLM models including GLM-4.5V
+    # Supports both text-only and multimodal (vision) requests
+    #
+    # Note: Z.ai uses OpenAI-compatible API format but with /v4/ path instead of /v1/
+    # We use Faraday directly instead of the ruby-openai gem to properly handle the API path
+    class ZaiClient < BaseClient
+      private
+
+      # Override token calculation to handle multimodal content
+      def calculate_tokens(content)
+        case content
+        when String
+          super(content)
+        when Hash
+          # For multimodal content, count tokens only for text part
+          # Note: This is an approximation as images have variable token counts
+          text = content[:text] || content['text'] || ''
+          super(text)
+        when Array
+          # For pre-formatted arrays, extract and count text parts
+          text_parts = content.select { |part| part[:type] == 'text' || part['type'] == 'text' }
+                              .map { |part| part[:text] || part['text'] || '' }
+                              .join(' ')
+          super(text_parts)
+        else
+          super(content.to_s)
+        end
+      end
+
+      def generate_content(prompt)
+        content = format_content(prompt)
+
+        # Retry logic for transient errors (similar to OpenRouter)
+        max_retries = 3
+        retry_count = 0
+
+        begin
+          # Make direct HTTP request to Z.ai API since they use /v4/ instead of /v1/
+          response = http_client.post('chat/completions') do |req|
+            req.body = {
+              model:,
+              messages: [{ role: 'user', content: }]
+            }.to_json
+          end
+
+          # Response body is already parsed as Hash by Faraday's JSON middleware
+          response_data = response.body.is_a?(String) ? JSON.parse(response.body) : response.body
+          response_data.dig('choices', 0, 'message', 'content')
+        rescue Faraday::ServerError => e
+          retry_count += 1
+
+          # Log retry attempts if logger is configured
+          configuration.logger&.warn(
+            "Z.ai API error (attempt #{retry_count}/#{max_retries}): #{e.message}"
+          )
+
+          raise unless retry_count < max_retries
+
+          wait_time = 2**retry_count # Exponential backoff: 2, 4, 8 seconds
+          configuration.logger&.info("Retrying in #{wait_time}s...")
+          sleep(wait_time)
+          retry
+        end
+      end
+
+      # Format content based on whether it's a simple string or multimodal content
+      # @param prompt [String, Hash, Array] The prompt content
+      # @return [String, Array] Formatted content for the API
+      def format_content(prompt)
+        case prompt
+        when Hash
+          # Handle hash with text and/or images
+          format_multimodal_hash(prompt)
+        when Array
+          # Already formatted as array of content parts
+          prompt
+        else
+          # Simple string prompt
+          prompt.to_s
+        end
+      end
+
+      # Format a hash containing text and/or images into multimodal content array
+      # @param prompt_hash [Hash] Hash with :text and/or :images keys
+      # @return [Array] Array of content parts for the API
+      def format_multimodal_hash(prompt_hash)
+        content_parts = []
+
+        # Add text part if present
+        if prompt_hash[:text] || prompt_hash['text']
+          text = prompt_hash[:text] || prompt_hash['text']
+          content_parts << { type: 'text', text: }
+        end
+
+        # Add image parts if present
+        images = prompt_hash[:images] || prompt_hash['images'] || []
+        images = [images] unless images.is_a?(Array)
+
+        images.each do |image|
+          content_parts << format_image_part(image)
+        end
+
+        content_parts
+      end
+
+      # Format an image into the appropriate API structure
+      # @param image [String, Hash] Image URL or hash with url/detail keys
+      # @return [Hash] Formatted image part for the API
+      def format_image_part(image)
+        case image
+        when String
+          # Simple URL string or base64 data
+          { type: 'image_url', image_url: { url: image } }
+        when Hash
+          # Hash with url and optional detail level
+          {
+            type: 'image_url',
+            image_url: {
+              url: image[:url] || image['url'],
+              detail: image[:detail] || image['detail']
+            }.compact
+          }
+        end
+      end
+
+      # HTTP client for making requests to Z.ai API
+      # Z.ai uses /v4/ in their path, not /v1/ like OpenAI, so we use Faraday directly
+      def http_client
+        @http_client ||= begin
+          config = LlmConductor.configuration.provider_config(:zai)
+          base_url = config[:uri_base] || 'https://api.z.ai/api/paas/v4'
+
+          Faraday.new(url: base_url) do |f|
+            f.request :json
+            f.response :json
+            f.headers['Authorization'] = "Bearer #{config[:api_key]}"
+            f.headers['Content-Type'] = 'application/json'
+            f.adapter Faraday.default_adapter
+          end
+        end
+      end
+
+      # Legacy client method for compatibility (not used, but kept for reference)
+      def client
+        http_client
+      end
+    end
+  end
+end

data/lib/llm_conductor/configuration.rb
CHANGED

@@ -72,6 +72,14 @@ module LlmConductor
       }
     end
 
+    # Configure Z.ai provider
+    def zai(api_key: nil, **options)
+      @providers[:zai] = {
+        api_key: api_key || ENV['ZAI_API_KEY'],
+        **options
+      }
+    end
+
     # Get provider configuration
     def provider_config(provider)
       @providers[provider.to_sym] || {}
@@ -126,6 +134,14 @@ module LlmConductor
       groq(api_key: value)
     end
 
+    def zai_api_key
+      provider_config(:zai)[:api_key]
+    end
+
+    def zai_api_key=(value)
+      zai(api_key: value)
+    end
+
     private
 
     def setup_defaults_from_env
@@ -135,6 +151,7 @@ module LlmConductor
       openrouter if ENV['OPENROUTER_API_KEY']
       gemini if ENV['GEMINI_API_KEY']
       groq if ENV['GROQ_API_KEY']
+      zai if ENV['ZAI_API_KEY']
      ollama # Always configure Ollama with default URL
     end
   end

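Per the configuration diff above, the Z.ai key can be supplied either through the provider block or the new attribute writer, and it is also picked up from `ZAI_API_KEY` automatically. A minimal sketch, assuming the rest of the configuration DSL is unchanged from 1.1.1:

```ruby
LlmConductor.configure do |config|
  # Equivalent ways to register the Z.ai provider, per this diff:
  config.zai(api_key: ENV['ZAI_API_KEY'])
  config.zai_api_key = ENV['ZAI_API_KEY']
end

# Reading the stored provider settings back out
LlmConductor.configuration.provider_config(:zai)[:api_key]
```
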
data/lib/llm_conductor/prompt_manager.rb
CHANGED

@@ -59,9 +59,7 @@ module LlmConductor
     def validate_prompt_class!(prompt_class)
       raise InvalidPromptClassError, 'Prompt must be a class' unless prompt_class.is_a?(Class)
 
-      unless prompt_class < Prompts::BasePrompt
-        raise InvalidPromptClassError, 'Prompt class must inherit from BasePrompt'
-      end
+      raise InvalidPromptClassError, 'Prompt class must inherit from BasePrompt' unless prompt_class < Prompts::BasePrompt
 
       return if prompt_class.instance_methods(false).include?(:render)
 

|
data/lib/llm_conductor.rb
CHANGED
|
@@ -14,10 +14,11 @@ require_relative 'llm_conductor/clients/groq_client'
|
|
|
14
14
|
require_relative 'llm_conductor/clients/ollama_client'
|
|
15
15
|
require_relative 'llm_conductor/clients/openrouter_client'
|
|
16
16
|
require_relative 'llm_conductor/clients/gemini_client'
|
|
17
|
+
require_relative 'llm_conductor/clients/zai_client'
|
|
17
18
|
require_relative 'llm_conductor/client_factory'
|
|
18
19
|
|
|
19
20
|
# LLM Conductor provides a unified interface for multiple Language Model providers
|
|
20
|
-
# including OpenAI GPT, Anthropic Claude, Google Gemini, Groq, OpenRouter, and Ollama
|
|
21
|
+
# including OpenAI GPT, Anthropic Claude, Google Gemini, Groq, OpenRouter, Z.ai, and Ollama
|
|
21
22
|
# with built-in prompt templates, token counting, and extensible client architecture.
|
|
22
23
|
module LlmConductor
|
|
23
24
|
class Error < StandardError; end
|
|
@@ -63,16 +64,17 @@ module LlmConductor
|
|
|
63
64
|
when :ollama then Clients::OllamaClient
|
|
64
65
|
when :gemini, :google then Clients::GeminiClient
|
|
65
66
|
when :groq then Clients::GroqClient
|
|
67
|
+
when :zai then Clients::ZaiClient
|
|
66
68
|
else
|
|
67
69
|
raise ArgumentError,
|
|
68
70
|
"Unsupported vendor: #{vendor}. " \
|
|
69
|
-
'Supported vendors: anthropic, openai, openrouter, ollama, gemini, groq'
|
|
71
|
+
'Supported vendors: anthropic, openai, openrouter, ollama, gemini, groq, zai'
|
|
70
72
|
end
|
|
71
73
|
end
|
|
72
74
|
end
|
|
73
75
|
|
|
74
76
|
# List of supported vendors
|
|
75
|
-
SUPPORTED_VENDORS = %i[anthropic openai openrouter ollama gemini groq].freeze
|
|
77
|
+
SUPPORTED_VENDORS = %i[anthropic openai openrouter ollama gemini groq zai].freeze
|
|
76
78
|
|
|
77
79
|
# List of supported prompt types
|
|
78
80
|
SUPPORTED_PROMPT_TYPES = %i[
|
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: llm_conductor
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.2.0
 platform: ruby
 authors:
 - Ben Zheng
 bindir: exe
 cert_chain: []
-date: 2025-10-
+date: 2025-10-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -152,13 +152,16 @@ files:
 - LICENSE
 - README.md
 - Rakefile
+- VISION_USAGE.md
 - config/initializers/llm_conductor.rb
 - examples/data_builder_usage.rb
 - examples/gemini_usage.rb
 - examples/groq_usage.rb
+- examples/openrouter_vision_usage.rb
 - examples/prompt_registration.rb
 - examples/rag_usage.rb
 - examples/simple_usage.rb
+- examples/zai_usage.rb
 - lib/llm_conductor.rb
 - lib/llm_conductor/client_factory.rb
 - lib/llm_conductor/clients/anthropic_client.rb
@@ -168,6 +171,7 @@ files:
 - lib/llm_conductor/clients/groq_client.rb
 - lib/llm_conductor/clients/ollama_client.rb
 - lib/llm_conductor/clients/openrouter_client.rb
+- lib/llm_conductor/clients/zai_client.rb
 - lib/llm_conductor/configuration.rb
 - lib/llm_conductor/data_builder.rb
 - lib/llm_conductor/prompt_manager.rb