llm_conductor 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +11 -1
- data/README.md +87 -3
- data/VISION_USAGE.md +146 -9
- data/examples/claude_vision_usage.rb +138 -0
- data/examples/gpt_vision_usage.rb +156 -0
- data/examples/zai_usage.rb +163 -0
- data/lib/llm_conductor/client_factory.rb +4 -1
- data/lib/llm_conductor/clients/anthropic_client.rb +28 -1
- data/lib/llm_conductor/clients/concerns/vision_support.rb +159 -0
- data/lib/llm_conductor/clients/gpt_client.rb +7 -1
- data/lib/llm_conductor/clients/openrouter_client.rb +4 -81
- data/lib/llm_conductor/clients/zai_client.rb +76 -0
- data/lib/llm_conductor/configuration.rb +17 -0
- data/lib/llm_conductor/prompt_manager.rb +1 -3
- data/lib/llm_conductor/version.rb +1 -1
- data/lib/llm_conductor.rb +5 -3
- metadata +7 -2
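The headline changes in this release are a shared `Concerns::VisionSupport` module (multimodal prompt handling reused by the GPT, Anthropic, and OpenRouter clients) and a new Z.ai client for GLM models. As a quick orientation before the per-file diffs, here is a minimal sketch of the new surface area pieced together from the examples below; the image URL is a placeholder, and the gem is assumed to be installed and required as `llm_conductor`.

```ruby
require 'llm_conductor'

# Introduced in this diff: Z.ai provider configuration (see configuration.rb and zai_client.rb below).
LlmConductor.configure do |config|
  config.zai(api_key: ENV['ZAI_API_KEY'])
end

# Vision-capable clients now accept a hash prompt with :text and :images,
# exactly as the new example scripts demonstrate.
response = LlmConductor.generate(
  model: 'glm-4.5v',
  vendor: :zai,
  prompt: {
    text: 'Describe this image.',
    images: 'https://example.com/photo.jpg' # placeholder URL
  }
)
puts response.output
```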
data/examples/gpt_vision_usage.rb (new file)

```diff
@@ -0,0 +1,156 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require_relative '../lib/llm_conductor'
+
+# This example demonstrates using GPT-4o vision capabilities
+# Set your OpenAI API key: export OPENAI_API_KEY='your-key-here'
+
+puts '=' * 80
+puts 'GPT-4o Vision Usage Examples'
+puts '=' * 80
+puts
+
+# Check for API key
+api_key = ENV['OPENAI_API_KEY']
+if api_key.nil? || api_key.empty?
+  puts 'ERROR: OPENAI_API_KEY environment variable is not set!'
+  puts
+  puts 'Please set your OpenAI API key:'
+  puts ' export OPENAI_API_KEY="your-key-here"'
+  puts
+  puts 'You can get an API key from: https://platform.openai.com/api-keys'
+  exit 1
+end
+
+# Configure the client
+LlmConductor.configure do |config|
+  config.openai(api_key:)
+end
+
+# Example 1: Single Image Analysis
+puts "\n1. Single Image Analysis"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'What is in this image? Please describe it in detail.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 2: Multiple Images Comparison
+puts "\n2. Multiple Images Comparison"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'Compare these two images. What are the main differences?',
+    images: [
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Placeholder_view_vector.svg/1024px-Placeholder_view_vector.svg.png'
+    ]
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 3: Image with Detail Level - High Resolution
+puts "\n3. Image with Detail Level - High Resolution"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'Analyze this high-resolution image in detail. What are all the elements you can see?',
+    images: [
+      { url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg', detail: 'high' }
+    ]
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 4: Image with Detail Level - Low (Faster, Cheaper)
+puts "\n4. Image with Detail Level - Low (Faster, Cheaper)"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'Give me a quick description of this image.',
+    images: [
+      { url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg', detail: 'low' }
+    ]
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 5: Raw Format (Advanced)
+puts "\n5. Raw Format (Advanced)"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: [
+    { type: 'text', text: 'What is in this image?' },
+    { type: 'image_url',
+      image_url: { url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg' } },
+    { type: 'text', text: 'Describe the weather conditions.' }
+  ]
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 6: Text-Only Request (Backward Compatible)
+puts "\n6. Text-Only Request (Backward Compatible)"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: 'What is the capital of France?'
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+# Example 7: Multiple Images with Mixed Detail Levels
+puts "\n7. Multiple Images with Mixed Detail Levels"
+puts '-' * 80
+
+response = LlmConductor.generate(
+  model: 'gpt-4o',
+  vendor: :openai,
+  prompt: {
+    text: 'Compare these images at different detail levels.',
+    images: [
+      {
+        url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1024px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg', detail: 'high'
+      },
+      { url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Placeholder_view_vector.svg/1024px-Placeholder_view_vector.svg.png', detail: 'low' }
+    ]
+  }
+)
+
+puts "Response: #{response.output}"
+puts "Tokens: #{response.input_tokens} input, #{response.output_tokens} output"
+
+puts "\n#{'=' * 80}"
+puts 'All examples completed successfully!'
+puts '=' * 80
```
data/examples/zai_usage.rb (new file)

```diff
@@ -0,0 +1,163 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Example of Z.ai GLM model usage including multimodal/vision capabilities
+require_relative '../lib/llm_conductor'
+
+# Configure Z.ai
+LlmConductor.configure do |config|
+  config.zai(
+    api_key: ENV['ZAI_API_KEY']
+  )
+end
+
+# Example 1: Simple text-only request with GLM-4-plus
+puts '=== Example 1: Text-only request with GLM-4-plus ==='
+response = LlmConductor.generate(
+  model: 'glm-4-plus',
+  vendor: :zai,
+  prompt: 'What is the capital of France? Please answer in one sentence.'
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 2: Text request with GLM-4.5V (vision model, text-only mode)
+puts '=== Example 2: Text-only request with GLM-4.5V ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: 'Explain the concept of machine learning in simple terms.'
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 3: Vision request with a single image
+puts '=== Example 3: Single image analysis with GLM-4.5V ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'What do you see in this image? Please describe it in detail.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 4: Vision request with multiple images
+puts '=== Example 4: Multiple images comparison with GLM-4.5V ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Compare these two images and describe the differences you observe.',
+    images: [
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
+      'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Placeholder_view_vector.svg/681px-Placeholder_view_vector.svg.png'
+    ]
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 5: Image with detail level specification
+puts '=== Example 5: Image with detail level ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Describe this image in detail, including colors, objects, and atmosphere.',
+    images: [
+      {
+        url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
+        detail: 'high'
+      }
+    ]
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 6: Using raw array format (advanced)
+puts '=== Example 6: Raw array format ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: [
+    { type: 'text', text: 'What objects can you identify in this image?' },
+    {
+      type: 'image_url',
+      image_url: {
+        url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+      }
+    }
+  ]
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 7: Base64 encoded image (for local images)
+puts '=== Example 7: Using base64 encoded image ==='
+# NOTE: In real usage, you would read and encode a local file
+# require 'base64'
+# image_data = Base64.strict_encode64(File.read('path/to/image.jpg'))
+# image_url = "data:image/jpeg;base64,#{image_data}"
+
+# For this example, we'll use a URL
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Analyze this image and extract any text you can see.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 8: Error handling
+puts '=== Example 8: Error handling ==='
+begin
+  response = LlmConductor.generate(
+    model: 'glm-4.5v',
+    vendor: :zai,
+    prompt: {
+      text: 'Analyze this image',
+      images: 'invalid-url'
+    }
+  )
+
+  if response.success?
+    puts response.output
+  else
+    puts "Error: #{response.metadata[:error]}"
+  end
+rescue StandardError => e
+  puts "Exception: #{e.message}"
+end
+
+# Example 9: Document understanding (OCR)
+puts "\n=== Example 9: Document understanding ==="
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Please read any text visible in this image and transcribe it.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
+
+# Example 10: Complex reasoning with image
+puts '=== Example 10: Complex reasoning with image ==='
+response = LlmConductor.generate(
+  model: 'glm-4.5v',
+  vendor: :zai,
+  prompt: {
+    text: 'Based on this image, what time of day do you think it is? Explain your reasoning.',
+    images: 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg'
+  }
+)
+puts response.output
+puts "Tokens used: #{response.total_tokens}\n\n"
```
data/lib/llm_conductor/client_factory.rb

```diff
@@ -19,7 +19,8 @@ module LlmConductor
         ollama: Clients::OllamaClient,
         gemini: Clients::GeminiClient,
         google: Clients::GeminiClient,
-        groq: Clients::GroqClient
+        groq: Clients::GroqClient,
+        zai: Clients::ZaiClient
       }

       client_classes.fetch(vendor) do
@@ -35,6 +36,8 @@ module LlmConductor
          :openai
        when /^gemini/i
          :gemini
+      when /^glm/i
+        :zai
        when /^(llama|mixtral|gemma|qwen)/i
          :groq
        else
```
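Because the vendor-inference branch above now maps `glm`-prefixed model names to `:zai`, the `vendor:` argument can presumably be dropped for GLM models. A hedged sketch, assuming `LlmConductor.generate` falls back to this model-name inference when `vendor:` is omitted:

```ruby
# Assumption: generate infers vendor :zai from the /^glm/i branch shown above
# when no vendor: is passed explicitly.
response = LlmConductor.generate(
  model: 'glm-4.5v',
  prompt: 'Summarize the GLM model family in one sentence.'
)
puts response.output
```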
data/lib/llm_conductor/clients/anthropic_client.rb

```diff
@@ -1,18 +1,23 @@
 # frozen_string_literal: true

 require 'anthropic'
+require_relative 'concerns/vision_support'

 module LlmConductor
   module Clients
     # Anthropic Claude client implementation for accessing Claude models via Anthropic API
+    # Supports both text-only and multimodal (vision) requests
     class AnthropicClient < BaseClient
+      include Concerns::VisionSupport
+
       private

       def generate_content(prompt)
+        content = format_content(prompt)
         response = client.messages.create(
           model:,
           max_tokens: 4096,
-          messages: [{ role: 'user', content: prompt }]
+          messages: [{ role: 'user', content: }]
         )

         response.content.first.text
@@ -20,6 +25,28 @@ module LlmConductor
         raise StandardError, "Anthropic API error: #{e.message}"
       end

+      # Anthropic uses a different image format than OpenAI
+      # Format: { type: 'image', source: { type: 'url', url: '...' } }
+      def format_image_url(url)
+        { type: 'image', source: { type: 'url', url: } }
+      end
+
+      def format_image_hash(image_hash)
+        # Anthropic doesn't have a 'detail' parameter like OpenAI
+        {
+          type: 'image',
+          source: {
+            type: 'url',
+            url: image_hash[:url] || image_hash['url']
+          }
+        }
+      end
+
+      # Anthropic recommends placing images before text
+      def images_before_text?
+        true
+      end
+
       def client
         @client ||= begin
           config = LlmConductor.configuration.provider_config(:anthropic)
```
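To make the Anthropic overrides concrete, the sketch below (not part of the gem) shows the image part each format produces for the same URL string: the OpenAI-style default defined in the new `Concerns::VisionSupport` module (shown later in this diff) next to the Anthropic-style structure returned by the override above. The URL is a hypothetical placeholder.

```ruby
url = 'https://example.com/photo.jpg' # hypothetical URL

# Default from Concerns::VisionSupport#format_image_url (OpenAI style):
openai_part = { type: 'image_url', image_url: { url: } }
# => { type: 'image_url', image_url: { url: 'https://example.com/photo.jpg' } }

# AnthropicClient#format_image_url override (Anthropic style):
anthropic_part = { type: 'image', source: { type: 'url', url: } }
# => { type: 'image', source: { type: 'url', url: 'https://example.com/photo.jpg' } }
```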
data/lib/llm_conductor/clients/concerns/vision_support.rb (new file)

```diff
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+module LlmConductor
+  module Clients
+    module Concerns
+      # Shared module for vision/multimodal support across different LLM clients
+      # Provides common functionality for formatting images and text content
+      module VisionSupport
+        private
+
+        # Override token calculation to handle multimodal content
+        def calculate_tokens(content)
+          case content
+          when String then super(content)
+          when Hash then calculate_tokens_from_hash(content)
+          when Array then calculate_tokens_from_array(content)
+          else super(content.to_s)
+          end
+        end
+
+        # Calculate tokens from a hash containing text and/or images
+        # @param content_hash [Hash] Hash with :text and/or :images keys
+        # @return [Integer] Token count for text portion
+        def calculate_tokens_from_hash(content_hash)
+          text = content_hash[:text] || content_hash['text'] || ''
+          # Call the parent class's calculate_tokens with the extracted text
+          method(:calculate_tokens).super_method.call(text)
+        end
+
+        # Calculate tokens from an array of content parts
+        # @param content_array [Array] Array of content parts with type and text
+        # @return [Integer] Token count for all text parts
+        def calculate_tokens_from_array(content_array)
+          text_parts = extract_text_from_array(content_array)
+          # Call the parent class's calculate_tokens with the joined text
+          method(:calculate_tokens).super_method.call(text_parts)
+        end
+
+        # Extract and join text from array of content parts
+        # @param content_array [Array] Array of content parts
+        # @return [String] Joined text from all text parts
+        def extract_text_from_array(content_array)
+          content_array
+            .select { |part| text_part?(part) }
+            .map { |part| extract_text_from_part(part) }
+            .join(' ')
+        end
+
+        # Check if a content part is a text part
+        # @param part [Hash] Content part
+        # @return [Boolean] true if part is a text type
+        def text_part?(part)
+          part[:type] == 'text' || part['type'] == 'text'
+        end
+
+        # Extract text from a content part
+        # @param part [Hash] Content part with text
+        # @return [String] Text content
+        def extract_text_from_part(part)
+          part[:text] || part['text'] || ''
+        end
+
+        # Format content based on whether it's a simple string or multimodal content
+        # @param prompt [String, Hash, Array] The prompt content
+        # @return [String, Array] Formatted content for the API
+        def format_content(prompt)
+          case prompt
+          when Hash
+            # Handle hash with text and/or images
+            format_multimodal_hash(prompt)
+          when Array
+            # Already formatted as array of content parts
+            prompt
+          else
+            # Simple string prompt
+            prompt.to_s
+          end
+        end
+
+        # Format a hash containing text and/or images into multimodal content array
+        # @param prompt_hash [Hash] Hash with :text and/or :images keys
+        # @return [Array] Array of content parts for the API
+        def format_multimodal_hash(prompt_hash)
+          content_parts = []
+
+          # Add image parts (order depends on provider)
+          images = prompt_hash[:images] || prompt_hash['images'] || []
+          images = [images] unless images.is_a?(Array)
+
+          if images_before_text?
+            # Anthropic recommends images before text
+            images.each { |image| content_parts << format_image_part(image) }
+            add_text_part(content_parts, prompt_hash)
+          else
+            # OpenAI/most others: text before images
+            add_text_part(content_parts, prompt_hash)
+            images.each { |image| content_parts << format_image_part(image) }
+          end
+
+          content_parts
+        end
+
+        # Add text part to content array if present
+        # @param content_parts [Array] The content parts array
+        # @param prompt_hash [Hash] Hash with :text key
+        def add_text_part(content_parts, prompt_hash)
+          return unless prompt_hash[:text] || prompt_hash['text']
+
+          text = prompt_hash[:text] || prompt_hash['text']
+          content_parts << { type: 'text', text: }
+        end
+
+        # Format an image into the appropriate API structure
+        # This method should be overridden by clients that need different formats
+        # @param image [String, Hash] Image URL or hash with url/detail keys
+        # @return [Hash] Formatted image part for the API
+        def format_image_part(image)
+          case image
+          when String
+            format_image_url(image)
+          when Hash
+            format_image_hash(image)
+          end
+        end
+
+        # Format a simple image URL string
+        # Override this in subclasses for provider-specific format
+        # @param url [String] Image URL
+        # @return [Hash] Formatted image part
+        def format_image_url(url)
+          # Default: OpenAI format
+          { type: 'image_url', image_url: { url: } }
+        end
+
+        # Format an image hash with url and optional detail
+        # Override this in subclasses for provider-specific format
+        # @param image_hash [Hash] Hash with url and optional detail keys
+        # @return [Hash] Formatted image part
+        def format_image_hash(image_hash)
+          # Default: OpenAI format with detail support
+          {
+            type: 'image_url',
+            image_url: {
+              url: image_hash[:url] || image_hash['url'],
+              detail: image_hash[:detail] || image_hash['detail']
+            }.compact
+          }
+        end
+
+        # Whether to place images before text in the content array
+        # Override this in subclasses if needed (e.g., Anthropic recommends images first)
+        # @return [Boolean] true if images should come before text
+        def images_before_text?
+          false
+        end
+      end
+    end
+  end
+end
```
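For reference, here is a small sketch of what the concern's `format_multimodal_hash` builds from a hash prompt under the default (OpenAI-style) formatting, with a hypothetical image URL; clients that return true from `images_before_text?` (such as the Anthropic client above) emit the same parts with the image first.

```ruby
# Hash prompt accepted by any client that includes Concerns::VisionSupport:
prompt = {
  text: 'What is in this image?',
  images: [{ url: 'https://example.com/photo.jpg', detail: 'low' }] # hypothetical URL
}

# With the default format_image_hash and images_before_text? == false,
# format_multimodal_hash(prompt) produces:
# [
#   { type: 'text', text: 'What is in this image?' },
#   { type: 'image_url', image_url: { url: 'https://example.com/photo.jpg', detail: 'low' } }
# ]
```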
data/lib/llm_conductor/clients/gpt_client.rb

```diff
@@ -1,13 +1,19 @@
 # frozen_string_literal: true

+require_relative 'concerns/vision_support'
+
 module LlmConductor
   module Clients
     # OpenAI GPT client implementation for accessing GPT models via OpenAI API
+    # Supports both text-only and multimodal (vision) requests
     class GptClient < BaseClient
+      include Concerns::VisionSupport
+
       private

       def generate_content(prompt)
-        client.chat(parameters: { model:, messages: [{ role: 'user', content: prompt }] })
+        content = format_content(prompt)
+        client.chat(parameters: { model:, messages: [{ role: 'user', content: }] })
              .dig('choices', 0, 'message', 'content')
       end

```
data/lib/llm_conductor/clients/openrouter_client.rb

```diff
@@ -1,32 +1,15 @@
 # frozen_string_literal: true

+require_relative 'concerns/vision_support'
+
 module LlmConductor
   module Clients
     # OpenRouter client implementation for accessing various LLM providers through OpenRouter API
     # Supports both text-only and multimodal (vision) requests
     class OpenrouterClient < BaseClient
-      private
+      include Concerns::VisionSupport

-
-      def calculate_tokens(content)
-        case content
-        when String
-          super(content)
-        when Hash
-          # For multimodal content, count tokens only for text part
-          # Note: This is an approximation as images have variable token counts
-          text = content[:text] || content['text'] || ''
-          super(text)
-        when Array
-          # For pre-formatted arrays, extract and count text parts
-          text_parts = content.select { |part| part[:type] == 'text' || part['type'] == 'text' }
-                              .map { |part| part[:text] || part['text'] || '' }
-                              .join(' ')
-          super(text_parts)
-        else
-          super(content.to_s)
-        end
-      end
+      private

       def generate_content(prompt)
         content = format_content(prompt)
@@ -61,66 +44,6 @@ module LlmConductor
         end
       end

-      # Format content based on whether it's a simple string or multimodal content
-      # @param prompt [String, Hash, Array] The prompt content
-      # @return [String, Array] Formatted content for the API
-      def format_content(prompt)
-        case prompt
-        when Hash
-          # Handle hash with text and/or images
-          format_multimodal_hash(prompt)
-        when Array
-          # Already formatted as array of content parts
-          prompt
-        else
-          # Simple string prompt
-          prompt.to_s
-        end
-      end
-
-      # Format a hash containing text and/or images into multimodal content array
-      # @param prompt_hash [Hash] Hash with :text and/or :images keys
-      # @return [Array] Array of content parts for the API
-      def format_multimodal_hash(prompt_hash)
-        content_parts = []
-
-        # Add text part if present
-        if prompt_hash[:text] || prompt_hash['text']
-          text = prompt_hash[:text] || prompt_hash['text']
-          content_parts << { type: 'text', text: }
-        end
-
-        # Add image parts if present
-        images = prompt_hash[:images] || prompt_hash['images'] || []
-        images = [images] unless images.is_a?(Array)
-
-        images.each do |image|
-          content_parts << format_image_part(image)
-        end
-
-        content_parts
-      end
-
-      # Format an image into the appropriate API structure
-      # @param image [String, Hash] Image URL or hash with url/detail keys
-      # @return [Hash] Formatted image part for the API
-      def format_image_part(image)
-        case image
-        when String
-          # Simple URL string
-          { type: 'image_url', image_url: { url: image } }
-        when Hash
-          # Hash with url and optional detail level
-          {
-            type: 'image_url',
-            image_url: {
-              url: image[:url] || image['url'],
-              detail: image[:detail] || image['detail']
-            }.compact
-          }
-        end
-      end
-
       def client
         @client ||= begin
           config = LlmConductor.configuration.provider_config(:openrouter)
```