llm_conductor 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -0
- data/VISION_USAGE.md +100 -8
- data/examples/claude_vision_usage.rb +138 -0
- data/examples/gpt_vision_usage.rb +156 -0
- data/lib/llm_conductor/clients/anthropic_client.rb +28 -1
- data/lib/llm_conductor/clients/concerns/vision_support.rb +159 -0
- data/lib/llm_conductor/clients/gpt_client.rb +7 -1
- data/lib/llm_conductor/clients/openrouter_client.rb +4 -81
- data/lib/llm_conductor/clients/zai_client.rb +4 -81
- data/lib/llm_conductor/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bce592da24b8bb09f9702361a8d2de5051092290dd3b263f0026ddb877a8717b
+  data.tar.gz: 364a233ac3b1490010d949e15f83a3c45a5750ed117674ae2498508884cc365a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3ea0a7fc5d5fe1f729e6eb76b9b81eb5b24aaad96ba59ef954637e00184eded4f6fd44c591ee3921f86dd3131403fc496a77b355bd59e60158849c2e3af44511
+  data.tar.gz: 322cfca7d9e8917761af1b5de1033d9c11f58fceaa8d79aa18feee6b65050c4ab123c340479d44adeafa42f85b25c9e406e17ffda0af0ed6f871cb3d4d7d682f
data/.rubocop.yml
CHANGED
data/VISION_USAGE.md
CHANGED
@@ -1,9 +1,55 @@
 # Vision/Multimodal Usage Guide
 
-This guide explains how to use vision/multimodal capabilities with
+This guide explains how to use vision/multimodal capabilities with LLM Conductor. Vision support is available for Claude (Anthropic), GPT (OpenAI), OpenRouter, and Z.ai clients.
 
 ## Quick Start
 
+### Using Claude (Anthropic)
+
+```ruby
+require 'llm_conductor'
+
+# Configure
+LlmConductor.configure do |config|
+  config.anthropic(api_key: ENV['ANTHROPIC_API_KEY'])
+end
+
+# Analyze an image
+response = LlmConductor.generate(
+  model: 'claude-sonnet-4-20250514',
+  vendor: :anthropic,
+  prompt: {
+    text: 'What is in this image?',
+    images: 'https://example.com/image.jpg'
+  }
+)
+
+puts response.output
+```
+
+### Using GPT (OpenAI)
+
+```ruby
+require 'llm_conductor'
+
+# Configure
+LlmConductor.configure do |config|
+  config.openai(api_key: ENV['OPENAI_API_KEY'])
+end
+
+# Analyze an image
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'What is in this image?',
+    images: 'https://example.com/image.jpg'
+  }
+)
+
+puts response.output
+```
+
 ### Using OpenRouter
 
 ```ruby
@@ -52,6 +98,23 @@ puts response.output
 
 ## Recommended Models
 
+### Claude Models (Anthropic)
+
+For vision tasks via Anthropic API:
+
+- **`claude-sonnet-4-20250514`** - Claude Sonnet 4 (latest, best for vision) ✅
+- **`claude-opus-4-20250514`** - Claude Opus 4 (maximum quality)
+- **`claude-opus-4-1-20250805`** - Claude Opus 4.1 (newest flagship model)
+
+### GPT Models (OpenAI)
+
+For vision tasks via OpenAI API:
+
+- **`gpt-4o`** - Latest GPT-4 Omni with advanced vision capabilities ✅
+- **`gpt-4o-mini`** - Fast, cost-effective vision model
+- **`gpt-4-turbo`** - Previous generation with vision support
+- **`gpt-4-vision-preview`** - Legacy vision model (deprecated)
+
 ### OpenRouter Models
 
 For vision tasks via OpenRouter, these models work reliably:
@@ -103,12 +166,12 @@ response = LlmConductor.generate(
 
 ### 3. Image with Detail Level
 
-For high-resolution images, specify the detail level:
+For high-resolution images, specify the detail level (supported by GPT and OpenRouter):
 
 ```ruby
 response = LlmConductor.generate(
-  model: '
-  vendor: :
+  model: 'gpt-4o',
+  vendor: :openai,
   prompt: {
     text: 'Analyze this image in detail',
     images: [
@@ -118,19 +181,22 @@ response = LlmConductor.generate(
 )
 ```
 
-Detail levels:
+Detail levels (GPT and OpenRouter only):
 - `'high'` - Better for detailed analysis (uses more tokens)
 - `'low'` - Faster, cheaper (default if not specified)
 - `'auto'` - Let the model decide
 
+**Note:** Claude (Anthropic) and Z.ai don't support the `detail` parameter.
+
 ### 4. Raw Format (Advanced)
 
-For maximum control, use
+For maximum control, use provider-specific array formats:
 
+**GPT/OpenRouter Format:**
 ```ruby
 response = LlmConductor.generate(
-  model: '
-  vendor: :
+  model: 'gpt-4o',
+  vendor: :openai,
   prompt: [
     { type: 'text', text: 'What is in this image?' },
     { type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } },
@@ -139,6 +205,18 @@ response = LlmConductor.generate(
 )
 ```
 
+**Claude Format:**
+```ruby
+response = LlmConductor.generate(
+  model: 'claude-sonnet-4-20250514',
+  vendor: :anthropic,
+  prompt: [
+    { type: 'image', source: { type: 'url', url: 'https://example.com/image.jpg' } },
+    { type: 'text', text: 'What is in this image? Describe it in detail.' }
+  ]
+)
+```
+
 ## Text-Only Requests (Backward Compatible)
 
 The client still supports regular text-only requests:
@@ -204,6 +282,18 @@ response = LlmConductor.generate(
 
 ### Run Examples
 
+For Claude:
+```bash
+export ANTHROPIC_API_KEY='your-key'
+ruby examples/claude_vision_usage.rb
+```
+
+For GPT:
+```bash
+export OPENAI_API_KEY='your-key'
+ruby examples/gpt_vision_usage.rb
+```
+
 For OpenRouter:
 ```bash
 export OPENROUTER_API_KEY='your-key'
@@ -265,6 +355,8 @@ For production:
 
 ## Examples
 
+- `examples/claude_vision_usage.rb` - Complete Claude vision examples with Claude Sonnet 4
+- `examples/gpt_vision_usage.rb` - Complete GPT vision examples with GPT-4o
 - `examples/openrouter_vision_usage.rb` - Complete OpenRouter vision examples
 - `examples/zai_usage.rb` - Complete Z.ai GLM-4.5V examples including vision and text
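Taken together, the updated guide documents three interchangeable prompt shapes: a plain string, a hash with `text`/`images`, and a raw provider-specific content array. A minimal sketch of all three against one client, assuming the gem is configured as in the Quick Start; the model name and image URL are placeholders only:

```ruby
require 'llm_conductor'

LlmConductor.configure { |config| config.openai(api_key: ENV['OPENAI_API_KEY']) }

# 1. Plain string (text-only, backward compatible)
LlmConductor.generate(model: 'gpt-4o', vendor: :openai, prompt: 'Summarize this gem in one line.')

# 2. Hash with text and one or more images (the common vision case)
LlmConductor.generate(
  model: 'gpt-4o',
  vendor: :openai,
  prompt: { text: 'What is in this image?', images: 'https://example.com/image.jpg' }
)

# 3. Raw array of content parts, passed through unchanged (maximum control)
LlmConductor.generate(
  model: 'gpt-4o',
  vendor: :openai,
  prompt: [
    { type: 'text', text: 'What is in this image?' },
    { type: 'image_url', image_url: { url: 'https://example.com/image.jpg', detail: 'low' } }
  ]
)
```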
data/examples/claude_vision_usage.rb
ADDED
@@ -0,0 +1,138 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require_relative '../lib/llm_conductor'
+
+# This example demonstrates using Claude Sonnet 4 vision capabilities
+# Set your Anthropic API key: export ANTHROPIC_API_KEY='your-key-here'
+
+puts '=' * 80
+puts 'Claude Sonnet 4 Vision Usage Examples'
+puts '=' * 80
+puts
+
+# Check for API key
+api_key = ENV['ANTHROPIC_API_KEY']
+if api_key.nil? || api_key.empty?
+  puts 'ERROR: ANTHROPIC_API_KEY environment variable is not set!'
+  puts
+  puts 'Please set your Anthropic API key:'
+  puts '  export ANTHROPIC_API_KEY="your-key-here"'
+  puts
+  puts 'You can get an API key from: https://console.anthropic.com/'
+  exit 1
+end
+
+# Configure the client
+LlmConductor.configure do |config|
+  config.anthropic(api_key:)
+end
+
+# Example 1: Single Image Analysis
+puts "\n1. Single Image Analysis"
+puts '-' * 80
+
+begin
+  response = LlmConductor.generate(
+    model: 'claude-sonnet-4-20250514',
+    vendor: :anthropic,
+    prompt: {
+      text: 'What is in this image? Please describe it in detail.',
+      images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+    }
+  )
+
+  puts "Response: #{response.output}"
+  puts "Success: #{response.success?}"
+  puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+  puts "Metadata: #{response.metadata.inspect}" if response.metadata && !response.metadata.empty?
+rescue StandardError => e
+  puts "ERROR: #{e.message}"
+  puts "Backtrace: #{e.backtrace.first(5).join("\n")}"
+end
+
+# Example 2: Multiple Images Comparison
+puts "\n2. Multiple Images Comparison"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'claude-sonnet-4-20250514',
+  vendor: :anthropic,
+  prompt: {
+    text: 'Compare these two images. What are the main differences?',
+    images: [
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Placeholder_view_vector.svg/1024px-Placeholder_view_vector.svg.png'
+    ]
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 3: Image with Specific Question
+puts "\n3. Image with Specific Question"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'claude-sonnet-4-20250514',
+  vendor: :anthropic,
+  prompt: {
+    text: 'Is there a wooden boardwalk visible in this image? If yes, describe its condition.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 4: Raw Format (Advanced)
+puts "\n4. Raw Format (Advanced)"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'claude-sonnet-4-20250514',
+  vendor: :anthropic,
+  prompt: [
+    { type: 'image',
+      source: { type: 'url',
+                url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg' } },
+    { type: 'text', text: 'Describe the weather conditions in this image.' }
+  ]
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 5: Text-Only Request (Backward Compatible)
+puts "\n5. Text-Only Request (Backward Compatible)"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'claude-sonnet-4-20250514',
+  vendor: :anthropic,
+  prompt: 'What is the capital of France?'
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 6: Image Analysis with Detailed Instructions
+puts "\n6. Image Analysis with Detailed Instructions"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'claude-sonnet-4-20250514',
+  vendor: :anthropic,
+  prompt: {
+    text: 'Analyze this image and provide: 1) Main subjects, 2) Colors and lighting, 3) Mood or atmosphere, 4) Any notable details',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+puts "\n#{'=' * 80}"
+puts 'All examples completed successfully!'
+puts '=' * 80
data/examples/gpt_vision_usage.rb
ADDED
@@ -0,0 +1,156 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require_relative '../lib/llm_conductor'
+
+# This example demonstrates using GPT-4o vision capabilities
+# Set your OpenAI API key: export OPENAI_API_KEY='your-key-here'
+
+puts '=' * 80
+puts 'GPT-4o Vision Usage Examples'
+puts '=' * 80
+puts
+
+# Check for API key
+api_key = ENV['OPENAI_API_KEY']
+if api_key.nil? || api_key.empty?
+  puts 'ERROR: OPENAI_API_KEY environment variable is not set!'
+  puts
+  puts 'Please set your OpenAI API key:'
+  puts '  export OPENAI_API_KEY="your-key-here"'
+  puts
+  puts 'You can get an API key from: https://platform.openai.com/api-keys'
+  exit 1
+end
+
+# Configure the client
+LlmConductor.configure do |config|
+  config.openai(api_key:)
+end
+
+# Example 1: Single Image Analysis
+puts "\n1. Single Image Analysis"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'What is in this image? Please describe it in detail.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 2: Multiple Images Comparison
+puts "\n2. Multiple Images Comparison"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'Compare these two images. What are the main differences?',
+    images: [
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Placeholder_view_vector.svg/1024px-Placeholder_view_vector.svg.png'
+    ]
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 3: Image with Detail Level - High Resolution
+puts "\n3. Image with Detail Level - High Resolution"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'Analyze this high-resolution image in detail. What are all the elements you can see?',
+    images: [
+      { url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg', detail: 'high' }
+    ]
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 4: Image with Detail Level - Low (Faster, Cheaper)
+puts "\n4. Image with Detail Level - Low (Faster, Cheaper)"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'Give me a quick description of this image.',
+    images: [
+      { url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg', detail: 'low' }
+    ]
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 5: Raw Format (Advanced)
+puts "\n5. Raw Format (Advanced)"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: [
+    { type: 'text', text: 'What is in this image?' },
+    { type: 'image_url',
+      image_url: { url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg' } },
+    { type: 'text', text: 'Describe the weather conditions.' }
+  ]
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 6: Text-Only Request (Backward Compatible)
+puts "\n6. Text-Only Request (Backward Compatible)"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: 'What is the capital of France?'
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 7: Multiple Images with Mixed Detail Levels
+puts "\n7. Multiple Images with Mixed Detail Levels"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'Compare these images at different detail levels.',
+    images: [
+      {
+        url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg', detail: 'high'
+      },
+      { url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Placeholder_view_vector.svg/1024px-Placeholder_view_vector.svg.png', detail: 'low' }
+    ]
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+puts "\n#{'=' * 80}"
+puts 'All examples completed successfully!'
+puts '=' * 80
data/lib/llm_conductor/clients/anthropic_client.rb
CHANGED
@@ -1,18 +1,23 @@
 # frozen_string_literal: true
 
 require 'anthropic'
+require_relative 'concerns/vision_support'
 
 module LlmConductor
   module Clients
     # Anthropic Claude client implementation for accessing Claude models via Anthropic API
+    # Supports both text-only and multimodal (vision) requests
     class AnthropicClient < BaseClient
+      include Concerns::VisionSupport
+
      private
 
      def generate_content(prompt)
+        content = format_content(prompt)
        response = client.messages.create(
          model:,
          max_tokens: 4096,
-          messages: [{ role: 'user', content:
+          messages: [{ role: 'user', content: }]
        )
 
        response.content.first.text
@@ -20,6 +25,28 @@ module LlmConductor
        raise StandardError, "Anthropic API error: #{e.message}"
      end
 
+      # Anthropic uses a different image format than OpenAI
+      # Format: { type: 'image', source: { type: 'url', url: '...' } }
+      def format_image_url(url)
+        { type: 'image', source: { type: 'url', url: } }
+      end
+
+      def format_image_hash(image_hash)
+        # Anthropic doesn't have a 'detail' parameter like OpenAI
+        {
+          type: 'image',
+          source: {
+            type: 'url',
+            url: image_hash[:url] || image_hash['url']
+          }
+        }
+      end
+
+      # Anthropic recommends placing images before text
+      def images_before_text?
+        true
+      end
+
      def client
        @client ||= begin
          config = LlmConductor.configuration.provider_config(:anthropic)
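For reference, with these overrides a hash prompt is rewritten into Claude's content layout before `client.messages.create` is called: image blocks first, text last. A sketch of the transformation (the prompt values below are illustrative, not from the gem):

```ruby
# Input accepted by AnthropicClient#generate_content:
prompt = {
  text: 'What is in this image?',
  images: 'https://example.com/photo.jpg'
}

# What format_content(prompt) yields here, given images_before_text? == true
# and the Anthropic-specific format_image_url override:
content = [
  { type: 'image', source: { type: 'url', url: 'https://example.com/photo.jpg' } },
  { type: 'text', text: 'What is in this image?' }
]

# ...which is then sent as messages: [{ role: 'user', content: }]
```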
data/lib/llm_conductor/clients/concerns/vision_support.rb
ADDED
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+module LlmConductor
+  module Clients
+    module Concerns
+      # Shared module for vision/multimodal support across different LLM clients
+      # Provides common functionality for formatting images and text content
+      module VisionSupport
+        private
+
+        # Override token calculation to handle multimodal content
+        def calculate_tokens(content)
+          case content
+          when String then super(content)
+          when Hash then calculate_tokens_from_hash(content)
+          when Array then calculate_tokens_from_array(content)
+          else super(content.to_s)
+          end
+        end
+
+        # Calculate tokens from a hash containing text and/or images
+        # @param content_hash [Hash] Hash with :text and/or :images keys
+        # @return [Integer] Token count for text portion
+        def calculate_tokens_from_hash(content_hash)
+          text = content_hash[:text] || content_hash['text'] || ''
+          # Call the parent class's calculate_tokens with the extracted text
+          method(:calculate_tokens).super_method.call(text)
+        end
+
+        # Calculate tokens from an array of content parts
+        # @param content_array [Array] Array of content parts with type and text
+        # @return [Integer] Token count for all text parts
+        def calculate_tokens_from_array(content_array)
+          text_parts = extract_text_from_array(content_array)
+          # Call the parent class's calculate_tokens with the joined text
+          method(:calculate_tokens).super_method.call(text_parts)
+        end
+
+        # Extract and join text from array of content parts
+        # @param content_array [Array] Array of content parts
+        # @return [String] Joined text from all text parts
+        def extract_text_from_array(content_array)
+          content_array
+            .select { |part| text_part?(part) }
+            .map { |part| extract_text_from_part(part) }
+            .join(' ')
+        end
+
+        # Check if a content part is a text part
+        # @param part [Hash] Content part
+        # @return [Boolean] true if part is a text type
+        def text_part?(part)
+          part[:type] == 'text' || part['type'] == 'text'
+        end
+
+        # Extract text from a content part
+        # @param part [Hash] Content part with text
+        # @return [String] Text content
+        def extract_text_from_part(part)
+          part[:text] || part['text'] || ''
+        end
+
+        # Format content based on whether it's a simple string or multimodal content
+        # @param prompt [String, Hash, Array] The prompt content
+        # @return [String, Array] Formatted content for the API
+        def format_content(prompt)
+          case prompt
+          when Hash
+            # Handle hash with text and/or images
+            format_multimodal_hash(prompt)
+          when Array
+            # Already formatted as array of content parts
+            prompt
+          else
+            # Simple string prompt
+            prompt.to_s
+          end
+        end
+
+        # Format a hash containing text and/or images into multimodal content array
+        # @param prompt_hash [Hash] Hash with :text and/or :images keys
+        # @return [Array] Array of content parts for the API
+        def format_multimodal_hash(prompt_hash)
+          content_parts = []
+
+          # Add image parts (order depends on provider)
+          images = prompt_hash[:images] || prompt_hash['images'] || []
+          images = [images] unless images.is_a?(Array)
+
+          if images_before_text?
+            # Anthropic recommends images before text
+            images.each { |image| content_parts << format_image_part(image) }
+            add_text_part(content_parts, prompt_hash)
+          else
+            # OpenAI/most others: text before images
+            add_text_part(content_parts, prompt_hash)
+            images.each { |image| content_parts << format_image_part(image) }
+          end
+
+          content_parts
+        end
+
+        # Add text part to content array if present
+        # @param content_parts [Array] The content parts array
+        # @param prompt_hash [Hash] Hash with :text key
+        def add_text_part(content_parts, prompt_hash)
+          return unless prompt_hash[:text] || prompt_hash['text']
+
+          text = prompt_hash[:text] || prompt_hash['text']
+          content_parts << { type: 'text', text: }
+        end
+
+        # Format an image into the appropriate API structure
+        # This method should be overridden by clients that need different formats
+        # @param image [String, Hash] Image URL or hash with url/detail keys
+        # @return [Hash] Formatted image part for the API
+        def format_image_part(image)
+          case image
+          when String
+            format_image_url(image)
+          when Hash
+            format_image_hash(image)
+          end
+        end
+
+        # Format a simple image URL string
+        # Override this in subclasses for provider-specific format
+        # @param url [String] Image URL
+        # @return [Hash] Formatted image part
+        def format_image_url(url)
+          # Default: OpenAI format
+          { type: 'image_url', image_url: { url: } }
+        end
+
+        # Format an image hash with url and optional detail
+        # Override this in subclasses for provider-specific format
+        # @param image_hash [Hash] Hash with url and optional detail keys
+        # @return [Hash] Formatted image part
+        def format_image_hash(image_hash)
+          # Default: OpenAI format with detail support
+          {
+            type: 'image_url',
+            image_url: {
+              url: image_hash[:url] || image_hash['url'],
+              detail: image_hash[:detail] || image_hash['detail']
+            }.compact
+          }
+        end
+
+        # Whether to place images before text in the content array
+        # Override this in subclasses if needed (e.g., Anthropic recommends images first)
+        # @return [Boolean] true if images should come before text
+        def images_before_text?
+          false
+        end
+      end
+    end
+  end
+end
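A small sketch of how this concern behaves in isolation. `ToyBase` and `ToyClient` are made-up names for illustration; they only stand in for `BaseClient` so that the `super` calls in `calculate_tokens` have something to reach, and the sketch assumes `require 'llm_conductor'` makes the concern loadable:

```ruby
require 'llm_conductor'

# Hypothetical stand-in for BaseClient, just so `super` in the concern resolves.
class ToyBase
  private

  def calculate_tokens(content)
    content.to_s.split.size # crude word count instead of a real tokenizer
  end
end

class ToyClient < ToyBase
  include LlmConductor::Clients::Concerns::VisionSupport

  # The concern's helpers are private; expose them for the demo.
  def demo_format(prompt) = format_content(prompt)
  def demo_tokens(prompt) = calculate_tokens(prompt)
end

client = ToyClient.new

client.demo_format('plain text')
# => "plain text"

client.demo_format(text: 'Describe this', images: 'https://example.com/a.jpg')
# => [{ type: 'text', text: 'Describe this' },
#     { type: 'image_url', image_url: { url: 'https://example.com/a.jpg' } }]

client.demo_tokens(text: 'Describe this', images: 'https://example.com/a.jpg')
# => 2  (only the text portion is counted; image token cost is an approximation)
```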
data/lib/llm_conductor/clients/gpt_client.rb
CHANGED
@@ -1,13 +1,19 @@
 # frozen_string_literal: true
 
+require_relative 'concerns/vision_support'
+
 module LlmConductor
   module Clients
     # OpenAI GPT client implementation for accessing GPT models via OpenAI API
+    # Supports both text-only and multimodal (vision) requests
     class GptClient < BaseClient
+      include Concerns::VisionSupport
+
      private
 
      def generate_content(prompt)
-
+        content = format_content(prompt)
+        client.chat(parameters: { model:, messages: [{ role: 'user', content: }] })
          .dig('choices', 0, 'message', 'content')
      end
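The practical effect of this change: GPT prompts in hash or array form are normalized by the shared concern into OpenAI-style content parts and passed straight into `client.chat`. A sketch of what the request parameters effectively look like for a hash prompt with a detail level (the model name and URL are illustrative):

```ruby
prompt = {
  text: 'Analyze this image in detail',
  images: [{ url: 'https://example.com/image.jpg', detail: 'high' }]
}

# format_content(prompt) produces OpenAI-format parts (text first by default):
content = [
  { type: 'text', text: 'Analyze this image in detail' },
  { type: 'image_url', image_url: { url: 'https://example.com/image.jpg', detail: 'high' } }
]

# ...so the chat call is effectively made with:
parameters = {
  model: 'gpt-4o',
  messages: [{ role: 'user', content: content }]
}
```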
data/lib/llm_conductor/clients/openrouter_client.rb
CHANGED
@@ -1,32 +1,15 @@
 # frozen_string_literal: true
 
+require_relative 'concerns/vision_support'
+
 module LlmConductor
   module Clients
     # OpenRouter client implementation for accessing various LLM providers through OpenRouter API
     # Supports both text-only and multimodal (vision) requests
     class OpenrouterClient < BaseClient
-
+      include Concerns::VisionSupport
 
-
-      def calculate_tokens(content)
-        case content
-        when String
-          super(content)
-        when Hash
-          # For multimodal content, count tokens only for text part
-          # Note: This is an approximation as images have variable token counts
-          text = content[:text] || content['text'] || ''
-          super(text)
-        when Array
-          # For pre-formatted arrays, extract and count text parts
-          text_parts = content.select { |part| part[:type] == 'text' || part['type'] == 'text' }
-                              .map { |part| part[:text] || part['text'] || '' }
-                              .join(' ')
-          super(text_parts)
-        else
-          super(content.to_s)
-        end
-      end
+      private
 
      def generate_content(prompt)
        content = format_content(prompt)
@@ -61,66 +44,6 @@ module LlmConductor
        end
      end
 
-      # Format content based on whether it's a simple string or multimodal content
-      # @param prompt [String, Hash, Array] The prompt content
-      # @return [String, Array] Formatted content for the API
-      def format_content(prompt)
-        case prompt
-        when Hash
-          # Handle hash with text and/or images
-          format_multimodal_hash(prompt)
-        when Array
-          # Already formatted as array of content parts
-          prompt
-        else
-          # Simple string prompt
-          prompt.to_s
-        end
-      end
-
-      # Format a hash containing text and/or images into multimodal content array
-      # @param prompt_hash [Hash] Hash with :text and/or :images keys
-      # @return [Array] Array of content parts for the API
-      def format_multimodal_hash(prompt_hash)
-        content_parts = []
-
-        # Add text part if present
-        if prompt_hash[:text] || prompt_hash['text']
-          text = prompt_hash[:text] || prompt_hash['text']
-          content_parts << { type: 'text', text: }
-        end
-
-        # Add image parts if present
-        images = prompt_hash[:images] || prompt_hash['images'] || []
-        images = [images] unless images.is_a?(Array)
-
-        images.each do |image|
-          content_parts << format_image_part(image)
-        end
-
-        content_parts
-      end
-
-      # Format an image into the appropriate API structure
-      # @param image [String, Hash] Image URL or hash with url/detail keys
-      # @return [Hash] Formatted image part for the API
-      def format_image_part(image)
-        case image
-        when String
-          # Simple URL string
-          { type: 'image_url', image_url: { url: image } }
-        when Hash
-          # Hash with url and optional detail level
-          {
-            type: 'image_url',
-            image_url: {
-              url: image[:url] || image['url'],
-              detail: image[:detail] || image['detail']
-            }.compact
-          }
-        end
-      end
-
      def client
        @client ||= begin
          config = LlmConductor.configuration.provider_config(:openrouter)
data/lib/llm_conductor/clients/zai_client.rb
CHANGED
@@ -1,5 +1,7 @@
 # frozen_string_literal: true
 
+require_relative 'concerns/vision_support'
+
 module LlmConductor
   module Clients
     # Z.ai client implementation for accessing GLM models including GLM-4.5V
@@ -8,28 +10,9 @@ module LlmConductor
     # Note: Z.ai uses OpenAI-compatible API format but with /v4/ path instead of /v1/
     # We use Faraday directly instead of the ruby-openai gem to properly handle the API path
     class ZaiClient < BaseClient
-
+      include Concerns::VisionSupport
 
-
-      def calculate_tokens(content)
-        case content
-        when String
-          super(content)
-        when Hash
-          # For multimodal content, count tokens only for text part
-          # Note: This is an approximation as images have variable token counts
-          text = content[:text] || content['text'] || ''
-          super(text)
-        when Array
-          # For pre-formatted arrays, extract and count text parts
-          text_parts = content.select { |part| part[:type] == 'text' || part['type'] == 'text' }
-                              .map { |part| part[:text] || part['text'] || '' }
-                              .join(' ')
-          super(text_parts)
-        else
-          super(content.to_s)
-        end
-      end
+      private
 
      def generate_content(prompt)
        content = format_content(prompt)
@@ -67,66 +50,6 @@ module LlmConductor
      end
    end
 
-      # Format content based on whether it's a simple string or multimodal content
-      # @param prompt [String, Hash, Array] The prompt content
-      # @return [String, Array] Formatted content for the API
-      def format_content(prompt)
-        case prompt
-        when Hash
-          # Handle hash with text and/or images
-          format_multimodal_hash(prompt)
-        when Array
-          # Already formatted as array of content parts
-          prompt
-        else
-          # Simple string prompt
-          prompt.to_s
-        end
-      end
-
-      # Format a hash containing text and/or images into multimodal content array
-      # @param prompt_hash [Hash] Hash with :text and/or :images keys
-      # @return [Array] Array of content parts for the API
-      def format_multimodal_hash(prompt_hash)
-        content_parts = []
-
-        # Add text part if present
-        if prompt_hash[:text] || prompt_hash['text']
-          text = prompt_hash[:text] || prompt_hash['text']
-          content_parts << { type: 'text', text: }
-        end
-
-        # Add image parts if present
-        images = prompt_hash[:images] || prompt_hash['images'] || []
-        images = [images] unless images.is_a?(Array)
-
-        images.each do |image|
-          content_parts << format_image_part(image)
-        end
-
-        content_parts
-      end
-
-      # Format an image into the appropriate API structure
-      # @param image [String, Hash] Image URL or hash with url/detail keys
-      # @return [Hash] Formatted image part for the API
-      def format_image_part(image)
-        case image
-        when String
-          # Simple URL string or base64 data
-          { type: 'image_url', image_url: { url: image } }
-        when Hash
-          # Hash with url and optional detail level
-          {
-            type: 'image_url',
-            image_url: {
-              url: image[:url] || image['url'],
-              detail: image[:detail] || image['detail']
-            }.compact
-          }
-        end
-      end
-
    # HTTP client for making requests to Z.ai API
    # Z.ai uses /v4/ in their path, not /v1/ like OpenAI, so we use Faraday directly
    def http_client
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: llm_conductor
 version: !ruby/object:Gem::Version
-  version: 1.2.0
+  version: 1.3.0
 platform: ruby
 authors:
 - Ben Zheng
 bindir: exe
 cert_chain: []
-date: 2025-
+date: 2025-11-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -154,8 +154,10 @@ files:
 - Rakefile
 - VISION_USAGE.md
 - config/initializers/llm_conductor.rb
+- examples/claude_vision_usage.rb
 - examples/data_builder_usage.rb
 - examples/gemini_usage.rb
+- examples/gpt_vision_usage.rb
 - examples/groq_usage.rb
 - examples/openrouter_vision_usage.rb
 - examples/prompt_registration.rb
@@ -166,6 +168,7 @@ files:
 - lib/llm_conductor/client_factory.rb
 - lib/llm_conductor/clients/anthropic_client.rb
 - lib/llm_conductor/clients/base_client.rb
+- lib/llm_conductor/clients/concerns/vision_support.rb
 - lib/llm_conductor/clients/gemini_client.rb
 - lib/llm_conductor/clients/gpt_client.rb
 - lib/llm_conductor/clients/groq_client.rb