llms 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +160 -0
- data/bin/llms-chat +6 -0
- data/bin/llms-test-model-access +4 -0
- data/bin/llms-test-model-image-support +4 -0
- data/bin/llms-test-model-prompt-caching +4 -0
- data/bin/llms-test-model-tool-use +5 -0
- data/lib/llms/adapters/anthropic_message_adapter.rb +73 -0
- data/lib/llms/adapters/anthropic_tool_call_adapter.rb +20 -0
- data/lib/llms/adapters/base_message_adapter.rb +60 -0
- data/lib/llms/adapters/google_gemini_message_adapter.rb +72 -0
- data/lib/llms/adapters/google_gemini_tool_call_adapter.rb +20 -0
- data/lib/llms/adapters/open_ai_compatible_message_adapter.rb +88 -0
- data/lib/llms/adapters/open_ai_compatible_tool_call_adapter.rb +67 -0
- data/lib/llms/adapters.rb +12 -0
- data/lib/llms/apis/google_gemini_api.rb +45 -0
- data/lib/llms/apis/open_ai_compatible_api.rb +54 -0
- data/lib/llms/cli/base.rb +186 -0
- data/lib/llms/cli/chat.rb +92 -0
- data/lib/llms/cli/test_access.rb +79 -0
- data/lib/llms/cli/test_image_support.rb +92 -0
- data/lib/llms/cli/test_prompt_caching.rb +275 -0
- data/lib/llms/cli/test_tool_use.rb +108 -0
- data/lib/llms/cli.rb +12 -0
- data/lib/llms/conversation.rb +100 -0
- data/lib/llms/conversation_message.rb +60 -0
- data/lib/llms/conversation_tool_call.rb +14 -0
- data/lib/llms/conversation_tool_result.rb +15 -0
- data/lib/llms/exceptions.rb +33 -0
- data/lib/llms/executors/anthropic_executor.rb +247 -0
- data/lib/llms/executors/base_executor.rb +144 -0
- data/lib/llms/executors/google_gemini_executor.rb +212 -0
- data/lib/llms/executors/hugging_face_executor.rb +17 -0
- data/lib/llms/executors/open_ai_compatible_executor.rb +209 -0
- data/lib/llms/executors.rb +52 -0
- data/lib/llms/models/model.rb +86 -0
- data/lib/llms/models/provider.rb +48 -0
- data/lib/llms/models.rb +187 -0
- data/lib/llms/parsers/anthropic_chat_response_stream_parser.rb +184 -0
- data/lib/llms/parsers/google_gemini_chat_response_stream_parser.rb +128 -0
- data/lib/llms/parsers/open_ai_compatible_chat_response_stream_parser.rb +170 -0
- data/lib/llms/parsers/partial_json_parser.rb +77 -0
- data/lib/llms/parsers/sse_chat_response_stream_parser.rb +72 -0
- data/lib/llms/public_models.json +607 -0
- data/lib/llms/stream/event_emitter.rb +48 -0
- data/lib/llms/stream/events.rb +104 -0
- data/lib/llms/usage/cost_calculator.rb +75 -0
- data/lib/llms/usage/usage_data.rb +46 -0
- data/lib/llms.rb +16 -0
- metadata +243 -0
data/lib/llms/conversation_tool_result.rb
@@ -0,0 +1,15 @@
+module LLMs
+  class ConversationToolResult
+    attr_reader :index, :tool_call_id, :tool_call_type, :name, :results, :is_error
+
+    def initialize(index, tool_call_id, tool_call_type, name, results, is_error)
+      raise "index is nil" if index.nil?
+      @index = index
+      @tool_call_id = tool_call_id
+      @tool_call_type = tool_call_type
+      @name = name
+      @results = results
+      @is_error = is_error
+    end
+  end
+end
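ConversationToolResult is a plain value object: the constructor takes all six fields positionally and only guards against a nil index. A minimal construction sketch (the id, type, name, and payload values below are illustrative, not taken from the package):

    result = LLMs::ConversationToolResult.new(
      0,                    # index of the tool call within the message
      "toolu_abc123",       # tool_call_id (illustrative)
      "function",           # tool_call_type (illustrative)
      "get_weather",        # tool name (illustrative)
      { "temp_c" => 21 },   # results payload
      false                 # is_error
    )
    result.name       # => "get_weather"
    result.is_error   # => false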
data/lib/llms/exceptions.rb
@@ -0,0 +1,33 @@
+module LLMs
+  # Base exception class for all LLMs errors
+  class Error < StandardError; end
+
+  # Configuration errors
+  class ConfigurationError < Error; end
+  class MissingAPIKeyError < ConfigurationError; end
+  class InvalidModelError < ConfigurationError; end
+  class UnsupportedFeatureError < ConfigurationError; end
+  class ModelNotFoundError < ConfigurationError; end
+  class ProviderNotFoundError < ConfigurationError; end
+
+  # API communication errors
+  class APIError < Error; end
+  class RateLimitError < APIError; end
+  class TimeoutError < APIError; end
+  class NetworkError < APIError; end
+  class AuthenticationError < APIError; end
+
+  # Usage and cost calculation errors
+  class UsageError < Error; end
+  class CostCalculationError < UsageError; end
+
+  # Tool-related errors
+  class ToolError < Error; end
+  class ToolExecutionError < ToolError; end
+  class ToolValidationError < ToolError; end
+
+  # Conversation and message errors
+  class ConversationError < Error; end
+  class MessageError < ConversationError; end
+  class InvalidMessageRoleError < MessageError; end
+end
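Because every class above descends from LLMs::Error, callers can rescue at whatever granularity they need, from a single failure mode up to the whole library. A short sketch of how the hierarchy might be used (the executor call itself is illustrative):

    begin
      text = executor.execute_prompt("Summarize this document")
    rescue LLMs::RateLimitError
      sleep 30                             # most specific: back off on rate limits
      retry
    rescue LLMs::APIError => e
      warn "API failure: #{e.message}"     # any other transport or provider error
    rescue LLMs::Error => e
      warn "LLMs error: #{e.message}"      # catch-all for the library
    end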
data/lib/llms/executors/anthropic_executor.rb
@@ -0,0 +1,247 @@
+require_relative './base_executor'
+require_relative '../adapters/anthropic_message_adapter'
+require_relative '../parsers/anthropic_chat_response_stream_parser'
+require_relative '../stream/event_emitter'
+require 'anthropic'
+
+module LLMs
+  module Executors
+    class AnthropicExecutor < BaseExecutor
+
+      def execute_conversation(conversation, &block)
+        if block_given?
+          stream_conversation(conversation) do |handler|
+            handler.on(:text_delta) do |event|
+              yield event.text
+            end
+            ## TODO configure whether to yield thinking deltas
+            handler.on(:thinking_delta) do |event|
+              yield event.thinking
+            end
+          end
+        else
+          send_conversation(conversation)
+        end
+      end
+
+      def stream_conversation(conversation)
+        init_new_request(conversation)
+
+        emitter = Stream::EventEmitter.new
+        yield emitter if block_given?
+
+        start_time = Time.now
+        begin
+          http_response, stream_parsed_response = stream_client_request(emitter)
+        rescue Faraday::BadRequestError => e
+          @last_error = e.response[:body]
+          return nil
+        end
+        execution_time = Time.now - start_time
+
+        api_response = stream_parsed_response || http_response
+
+        if api_response['error']
+          @last_error = api_response['error']
+          return nil
+        end
+
+        @last_received_message_id = LLMs::Adapters::AnthropicMessageAdapter.find_message_id(api_response)
+        @last_received_message = LLMs::Adapters::AnthropicMessageAdapter.message_from_api_format(api_response)
+        @last_usage_data = calculate_usage(api_response, execution_time)
+
+        @last_received_message
+      end
+
+      def send_conversation(conversation)
+        init_new_request(conversation)
+
+        start_time = Time.now
+        begin
+          http_response = client_request
+        rescue Faraday::BadRequestError => e
+          @last_error = e.response[:body]
+          return nil
+        end
+        execution_time = Time.now - start_time
+
+        if http_response['error']
+          @last_error = http_response['error']
+          return nil
+        end
+
+        @last_received_message_id = LLMs::Adapters::AnthropicMessageAdapter.find_message_id(http_response)
+        @last_received_message = LLMs::Adapters::AnthropicMessageAdapter.message_from_api_format(http_response)
+        @last_usage_data = calculate_usage(http_response, execution_time)
+
+        @last_received_message
+      end
+
+      private
+
+      def init_new_request(conversation)
+        @last_sent_message = conversation.last_message
+        @last_received_message_id = nil
+        @last_received_message = nil
+        @last_usage_data = nil
+        @last_error = nil
+
+        @system_prompt = conversation.system_message
+        @available_tools = conversation.available_tools
+        @formatted_messages = conversation.messages.map.with_index do |message, index|
+          is_last_message = index == conversation.messages.size - 1
+          LLMs::Adapters::AnthropicMessageAdapter.to_api_format(message, caching_enabled? && is_last_message)
+        end
+
+        # Figure out where to put the cache control param if no messages are provided
+        # In reality there should always be a message, but we'll check
+        if caching_enabled? && @formatted_messages.empty?
+          if @available_tools && @available_tools.any?
+            @available_tools.last[:cache_control] = {type: "ephemeral"}
+          elsif @system_prompt && (@system_prompt.is_a?(String) || !@system_prompt[:cache_control])
+            @system_prompt = {type: "text", text: @system_prompt, cache_control: {type: "ephemeral"}}
+          end
+        end
+      end
+
+      def request_params
+        {
+          messages: @formatted_messages,
+          model: @model_name,
+          temperature: @temperature,
+        }.tap do |params|
+          if @system_prompt
+            params[:system] = system_param
+          end
+          if @max_tokens
+            params[:max_tokens] = @max_tokens
+          end
+          ## Will override max_tokens if both are provided
+          if @max_completion_tokens
+            params[:max_tokens] = @max_completion_tokens
+          end
+          if @available_tools && @available_tools.any?
+            params[:tools] = tool_schemas
+          end
+          if @thinking_mode
+            params[:thinking] = { type: 'enabled' }.tap do |thinking_params|
+              if @max_thinking_tokens
+                thinking_params[:budget_tokens] = @max_thinking_tokens
+              else
+                # This is the minimum budget for thinking, and is required if thinking is enabled
+                thinking_params[:budget_tokens] = 1024
+              end
+            end
+          end
+        end
+      end
+
+      def client_request
+        @client.messages(parameters: request_params)
+      end
+
+      def stream_client_request(emitter)
+        parser = Parsers::AnthropicChatResponseStreamParser.new(emitter)
+
+        params = request_params.merge({
+          stream: Proc.new do |data|
+            parser.handle_json(data)
+          end
+        })
+
+        [@client.messages(parameters: params), parser.full_response]
+      end
+
+      def initialize_client
+        @client = Anthropic::Client.new(access_token: fetch_api_key)
+      end
+
+      ## TODO move to adapter
+      def tool_schemas
+        @available_tools.map do |tool|
+          {
+            name: tool.tool_schema[:name],
+            description: tool.tool_schema[:description],
+            input_schema: {
+              type: 'object',
+              properties: tool.tool_schema[:parameters][:properties],
+              required: tool.tool_schema[:parameters][:required]
+            }
+          }
+        end
+      end
+
+      def system_param
+        @system_prompt
+      end
+
+      def calculate_usage(api_response, execution_time)
+        input_tokens = nil
+        output_tokens = nil
+        token_counts = {}
+        cache_was_written = nil
+        cache_was_read = nil
+
+        if usage = api_response['usage']
+          input_tokens = 0
+          output_tokens = 0
+          cache_was_written = false
+          cache_was_read = false
+
+          if it = usage['input_tokens']
+            input_tokens += it
+            token_counts[:input] = it
+          end
+
+          if ccit = usage['cache_creation_input_tokens']
+            input_tokens += ccit
+            if ccit > 0
+              cache_was_written = true
+            end
+          end
+
+          if crit = usage['cache_read_input_tokens']
+            input_tokens += crit
+            token_counts[:cache_read] = crit
+            if crit > 0
+              cache_was_read = true
+            end
+          end
+
+          if cache_details = usage['cache_creation']
+            if it1h = cache_details['ephemeral_1h_input_tokens']
+              token_counts[:cache_write_1hr] = it1h
+              if it1h > 0
+                cache_was_written = true
+              end
+            end
+            if it5min = cache_details['ephemeral_5min_input_tokens']
+              token_counts[:cache_write_5min] = it5min
+              if it5min > 0
+                cache_was_written = true
+              end
+            end
+          elsif ccit = usage['cache_creation_input_tokens']
+            # if no details, all caching is 5min
+            token_counts[:cache_write_5min] = ccit
+          end
+
+          if ot = usage['output_tokens']
+            output_tokens += ot
+            token_counts[:output] = ot
+          end
+        end
+
+        {
+          input_tokens: input_tokens,
+          output_tokens: output_tokens,
+          cache_was_written: cache_was_written,
+          cache_was_read: cache_was_read,
+          token_details: token_counts,
+          execution_time: execution_time,
+          estimated_cost: calculate_cost(token_counts)
+        }
+      end
+    end
+  end
+end
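Taken together with BaseExecutor (next file), the executor is driven either in blocking mode or with a block for streaming, and exposes the last request's outcome through readers. A usage sketch, assuming ANTHROPIC_API_KEY is set; the model name is illustrative:

    executor = LLMs::Executors::AnthropicExecutor.new(
      model_name: "claude-sonnet-4-20250514",   # illustrative
      api_key_env_var: "ANTHROPIC_API_KEY",
      max_tokens: 1024,
      cache_prompt: true    # marks the last message with cache_control
    )

    # Streaming: the block receives text (and thinking) deltas as they arrive.
    executor.execute_prompt("Explain prompt caching in one paragraph") { |chunk| print chunk }

    executor.last_usage_data   # => { input_tokens: ..., output_tokens: ..., estimated_cost: nil, ... }
    executor.last_error        # => nil on success, the API error body otherwise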
data/lib/llms/executors/base_executor.rb
@@ -0,0 +1,144 @@
+require_relative '../conversation'
+require_relative '../models'
+require_relative '../exceptions'
+
+module LLMs
+  module Executors
+    class BaseExecutor
+
+      DEFAULT_TEMPERATURE = 0.0
+
+      attr_reader :client, :model_name, :base_url,
+                  :system_prompt, :available_tools, :temperature,
+                  :max_tokens, :max_completion_tokens, :max_thinking_tokens,
+                  :thinking_mode, :thinking_effort,
+                  :last_sent_message, :last_received_message_id, :last_received_message, :last_usage_data, :last_error
+
+      def initialize(**params)
+        raise LLMs::ConfigurationError, "model_name is required" if params[:model_name].nil?
+
+        @model_name = params[:model_name]
+
+        # Connection Info
+        @base_url = params[:base_url]
+        @api_key = params[:api_key] ## Will take precedence over an env var if present
+        @api_key_env_var = params[:api_key_env_var]
+        @pricing = params[:pricing]
+        @exclude_params = params[:exclude_params]
+
+        # Execution Info
+        @system_prompt = params[:system_prompt]
+        @temperature = validate_temperature(params[:temperature] || DEFAULT_TEMPERATURE)
+        @available_tools = params[:tools]
+
+        @cache_prompt = params[:cache_prompt] ## TODO caching is automatic for most models now
+
+        @max_tokens = validate_positive_integer_or_nil(params[:max_tokens], "max_tokens")
+        @max_completion_tokens = validate_positive_integer_or_nil(params[:max_completion_tokens], "max_completion_tokens")
+        @max_thinking_tokens = validate_positive_integer_or_nil(params[:max_thinking_tokens], "max_thinking_tokens")
+
+        @thinking_mode = params.key?(:thinking) ? params[:thinking] : false
+        @thinking_effort = validate_thinking_effort(params[:thinking_effort])
+
+        ##TODO warn if max_tokens is used instead of max_completion_tokens and model is a thinking model (or thinking is enabled)
+
+        @last_sent_message = nil
+        @last_received_message = nil
+        @last_usage_data = nil
+        @last_error = nil
+
+        initialize_client
+      end
+
+      def execute_prompt(prompt, system_prompt: nil, &block)
+        conversation = LLMs::Conversation.new
+        if sp = system_prompt || @system_prompt
+          conversation.set_system_message(sp)
+        end
+        conversation.add_user_message(prompt)
+        response_message = self.execute_conversation(conversation, &block)
+        response_message&.text
+      end
+
+      def execute_conversation(conversation, &block)
+        raise NotImplementedError, "Subclasses must implement execute_conversation"
+      end
+
+      private
+
+      def fetch_api_key
+        if @api_key
+          @api_key
+        elsif @api_key_env_var
+          ENV[@api_key_env_var] || raise("#{@api_key_env_var} not set")
+        else
+          raise LLMs::ConfigurationError, "No API key provided"
+        end
+      end
+
+      def initialize_client
+        raise NotImplementedError, "Subclasses must implement initialize_client"
+      end
+
+      def tool_schemas
+        raise NotImplementedError, "Subclasses must implement tool_schemas"
+      end
+
+      def caching_enabled?
+        @cache_prompt == true ##TODO caching is automatic by default now for non-Anthropic models that support it
+      end
+
+      def calculate_usage(response)
+        raise NotImplementedError, "Subclasses must implement calculate_usage"
+      end
+
+      def validate_thinking_effort(effort)
+        return if effort.nil?
+
+        if effort.to_s.in?(%w[low medium high])
+          effort.to_s
+        else
+          raise LLMs::ConfigurationError, "Thinking effort must be a string 'low', 'medium', or 'high', got: #{effort}"
+        end
+      end
+
+      def validate_temperature(temp)
+        unless temp.is_a?(Numeric) && temp >= 0.0 && temp <= 2.0
+          raise LLMs::ConfigurationError, "Temperature must be a number between 0.0 and 2.0, got: #{temp}"
+        end
+        temp
+      end
+
+      def validate_positive_integer_or_nil(tokens, name)
+        unless tokens.nil? || (tokens.is_a?(Integer) && tokens > 0)
+          raise LLMs::ConfigurationError, "#{name} must be a positive integer, got: #{tokens}"
+        end
+        tokens
+      end
+
+      def calculate_cost(token_counts)
+        return nil unless @pricing
+        return nil if token_counts.nil? || token_counts.empty?
+
+        token_keys = token_counts.keys.map(&:to_s)
+        pricing_keys = @pricing.keys.map(&:to_s)
+
+        missing_keys = token_keys - pricing_keys
+        ## TODO remove this special case. Is it safe to skip all missing keys if the token count is zero for them?
+        unless missing_keys.empty? || (missing_keys.include?('cached_input') && token_counts[:cached_input] == 0)
+          raise LLMs::CostCalculationError, "Pricing missing key: #{missing_keys.join(', ')}"
+        end
+
+        token_keys.reduce(0.0) do |sum, k|
+          key = k.to_sym
+          if token_counts[key] && token_counts[key] > 0 && @pricing[key]
+            sum + (token_counts[key].to_f / 1_000_000.0) * @pricing[key]
+          else
+            sum
+          end
+        end
+      end
+
+    end
+  end
+end
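The private calculate_cost helper treats each pricing entry as dollars per million tokens and sums over whichever buckets appear in token_details, so cost estimates only appear when a :pricing hash is passed to the constructor. The arithmetic, worked by hand with made-up prices:

    pricing      = { input: 3.0, output: 15.0 }     # $ per 1M tokens (illustrative)
    token_counts = { input: 1_200, output: 300 }

    (1_200 / 1_000_000.0) * 3.0 +    # 0.0036
    (300 / 1_000_000.0) * 15.0       # 0.0045
    # => 0.0081, what calculate_cost would return for these inputs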
data/lib/llms/executors/google_gemini_executor.rb
@@ -0,0 +1,212 @@
+require_relative 'base_executor'
+require_relative '../apis/google_gemini_api'
+require_relative '../parsers/google_gemini_chat_response_stream_parser'
+require_relative '../adapters/google_gemini_message_adapter'
+
+module LLMs
+  module Executors
+    class GoogleGeminiExecutor < BaseExecutor
+
+      def execute_conversation(conversation, &block)
+        if block_given?
+          stream_conversation(conversation) do |handler|
+            handler.on(:text_delta) do |event|
+              yield event.text
+            end
+          end
+        else
+          send_conversation(conversation)
+        end
+      end
+
+      def stream_conversation(conversation)
+        init_new_request(conversation)
+
+        emitter = Stream::EventEmitter.new
+        yield emitter if block_given?
+
+        start_time = Time.now
+        begin
+          http_response, stream_parsed_response = stream_client_request(emitter)
+        rescue StandardError => e
+          @last_error = {'error' => e.message, 'backtrace' => e.backtrace}
+          return nil
+        end
+        execution_time = Time.now - start_time
+
+        if http_response && http_response['error']
+          @last_error = http_response
+          return nil
+        end
+
+        response_data = stream_parsed_response || http_response
+
+        @last_received_message = LLMs::Adapters::GoogleGeminiMessageAdapter.message_from_api_format(response_data)
+        @last_usage_data = calculate_usage(response_data, execution_time)
+
+        @last_received_message
+      end
+
+      def send_conversation(conversation)
+        init_new_request(conversation)
+
+        start_time = Time.now
+        begin
+          http_response = client_request
+        rescue StandardError => e
+          @last_error = {'error' => e.message, 'backtrace' => e.backtrace}
+          return nil
+        end
+        execution_time = Time.now - start_time
+
+        if http_response && http_response['error']
+          @last_error = http_response
+          return nil
+        end
+
+        @last_received_message = LLMs::Adapters::GoogleGeminiMessageAdapter.message_from_api_format(http_response)
+        if @last_received_message.nil?
+          @last_error = {'error' => 'No message found in the response. Can happen with thinking models if max_tokens is too low.'}
+          return nil
+        end
+
+        @last_received_message_id = "gemini-#{Time.now.to_i}" ## no message id in the response
+        @last_usage_data = calculate_usage(http_response, execution_time)
+
+
+        @last_received_message
+      end
+
+      private
+
+      def init_new_request(conversation)
+        @last_sent_message = conversation.last_message
+        @last_received_message = nil
+        @last_usage_data = nil
+        @last_error = nil
+
+        @formatted_messages = conversation.messages.map do |message|
+          LLMs::Adapters::GoogleGeminiMessageAdapter.to_api_format(message)
+        end
+
+        @available_tools = conversation.available_tools
+      end
+
+      ## TODO we are not inserting fake message_ids nor fake tool_call_ids for the response data, only in stremed events
+
+      def client_request
+        @client.generate_content(@model_name, @formatted_messages, request_params)
+      end
+
+      def stream_client_request(emitter)
+        parser = Parsers::GoogleGeminiChatResponseStreamParser.new(emitter)
+
+        params = request_params.merge(stream: Proc.new { |chunk| parser.add_data(chunk) })
+        http_response = @client.generate_content(@model_name, @formatted_messages, params)
+
+        @last_received_message_id = parser.current_message_id ##no message id in the response
+
+        [http_response, parser.full_response]
+      end
+
+      def request_params
+        generation_config = { temperature: @temperature }.tap do |config|
+          if @max_tokens
+            config[:maxOutputTokens] = @max_tokens
+          end
+          # Will override max_tokens if both are provided
+          if @max_completion_tokens
+            config[:maxOutputTokens] = @max_completion_tokens
+          end
+          if @thinking_mode
+            config[:thinkingConfig] = { includeThoughts: true }.tap do |thinking_config|
+              if @max_thinking_tokens
+                thinking_config[:thinkingBudget] = @max_thinking_tokens
+              end
+            end
+          end
+        end
+
+        { generationConfig: generation_config }.tap do |params|
+          if @system_prompt
+            params[:system_instruction] = { parts: [{text: @system_prompt}] }
+          end
+          if @available_tools && @available_tools.any?
+            params[:tools] = tool_schemas
+          end
+        end
+      end
+
+      def tool_schemas
+        [
+          {
+            function_declarations: @available_tools.map do |tool|
+              {
+                name: tool.tool_schema[:name],
+                description: tool.tool_schema[:description],
+                parameters: tool.tool_schema[:parameters]
+              }
+            end
+          }
+        ]
+      end
+
+      def calculate_usage(response, execution_time)
+        input_tokens = nil
+        output_tokens = nil
+        cache_was_written = nil
+        cache_was_read = nil
+        token_counts = {}
+
+        ## TODO cache write is never reported in usageMetadata so we can't calculate it's cost
+        ## Maybe there is no cost for implicit caching?
+        ## TODO support explicit caching
+
+        if usage_metadata = response['usageMetadata']
+          input_tokens = 0
+          output_tokens = 0
+          cache_was_read = false
+
+          if ptc = usage_metadata['promptTokenCount']
+            input_tokens += ptc
+            token_counts[:input] = ptc
+          end
+
+          if cctc = usage_metadata['cachedContentTokenCount']
+            cache_was_read = true
+            token_counts[:cache_read] = cctc
+            if token_counts[:input]
+              token_counts[:input] -= cctc
+            end
+          end
+
+          if otc = usage_metadata['thoughtsTokenCount']
+            output_tokens += otc
+            token_counts[:output] = otc
+          end
+
+          if ctc = usage_metadata['candidatesTokenCount']
+            output_tokens += ctc
+            token_counts[:output] ||= 0
+            token_counts[:output] += ctc
+          end
+        end
+
+        {
+          input_tokens: input_tokens,
+          output_tokens: output_tokens,
+          cache_was_written: cache_was_written,
+          cache_was_read: cache_was_read,
+          token_details: token_counts,
+          execution_time: execution_time,
+          estimated_cost: calculate_cost(token_counts)
+        }
+      end
+
+      def initialize_client
+        @client = LLMs::APIs::GoogleGeminiAPI.new(fetch_api_key)
+      end
+
+    end
+  end
+end
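For tool use, GoogleGeminiExecutor wraps every tool schema in a single function_declarations array instead of the flat list the Anthropic executor builds. Given a hypothetical tool whose tool_schema returns the hash below (the :name/:description/:parameters shape is what both executors read), the request would carry:

    tool_schema = {
      name: "get_weather",                                 # illustrative
      description: "Look up current weather for a city",
      parameters: {
        type: "object",
        properties: { city: { type: "string" } },
        required: ["city"]
      }
    }

    # GoogleGeminiExecutor#tool_schemas would then produce:
    # [{ function_declarations: [{ name: "get_weather",
    #                              description: "Look up current weather for a city",
    #                              parameters: { type: "object", properties: ..., required: ... } }] }]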
data/lib/llms/executors/hugging_face_executor.rb
@@ -0,0 +1,17 @@
+require_relative './open_ai_compatible_executor'
+
+module LLMs
+  module Executors
+    class HuggingFaceExecutor < OpenAICompatibleExecutor
+
+      private
+
+      ## TODO remove need for this class (by supporting e.g. a base_url_template param?)
+      def initialize_client
+        @base_url = "https://api-inference.huggingface.co/models/#{@model_name}/v1"
+        super
+      end
+
+    end
+  end
+end
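HuggingFaceExecutor exists only to derive the base URL from the model name before delegating to the OpenAI-compatible client. A sketch, assuming the token lives in an HF_TOKEN environment variable and using an illustrative model id:

    executor = LLMs::Executors::HuggingFaceExecutor.new(
      model_name: "meta-llama/Llama-3.1-8B-Instruct",   # illustrative
      api_key_env_var: "HF_TOKEN"
    )
    # Requests are sent to
    # https://api-inference.huggingface.co/models/meta-llama/Llama-3.1-8B-Instruct/v1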