ruby_llm-agents 3.2.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/app/views/ruby_llm/agents/executions/_audio_player.html.erb +57 -0
- data/app/views/ruby_llm/agents/executions/show.html.erb +8 -0
- data/lib/ruby_llm/agents/audio/speaker/active_storage_support.rb +87 -0
- data/lib/ruby_llm/agents/audio/speaker.rb +10 -0
- data/lib/ruby_llm/agents/audio/transcriber.rb +10 -0
- data/lib/ruby_llm/agents/core/configuration.rb +5 -1
- data/lib/ruby_llm/agents/core/version.rb +1 -1
- data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb +49 -0
- data/lib/ruby_llm/agents/results/speech_result.rb +12 -7
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 463487c17c50bf1496a30c9eea51dab3c334a17010853da97da5624a6cf564b5
|
|
4
|
+
data.tar.gz: 470a6666266d17dc8190f5118eec0c5f674fb51bae8b368d8713ea65d1882025
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b1e2d4688dfc294c3b94c95df084a248fa25fbcd7d99910f31f72b85138bf37a392a8b350462ee341d5d209674b844a9d2692a177db30845857a586fa77ce3bc
|
|
7
|
+
data.tar.gz: 50130237011f12c808a073d55b9083ce8449f8e0ddf8dd800c13134104f48233ea4f0fac3ddcbcc9a8b45b91814f67db5c57632dd6d54930c1ffd19fda825e96
|
data/README.md
CHANGED
|
@@ -135,7 +135,7 @@ result.save("logo.png")
|
|
|
135
135
|
| **Attachments** | Images, PDFs, and multimodal support | [Attachments](https://github.com/adham90/ruby_llm-agents/wiki/Attachments) |
|
|
136
136
|
| **Embeddings** | Vector embeddings with batching, caching, and preprocessing | [Embeddings](https://github.com/adham90/ruby_llm-agents/wiki/Embeddings) |
|
|
137
137
|
| **Image Operations** | Generation, analysis, editing, pipelines with cost tracking | [Images](https://github.com/adham90/ruby_llm-agents/wiki/Image-Generation) |
|
|
138
|
-
| **Audio** | Text-to-speech (OpenAI, ElevenLabs)
|
|
138
|
+
| **Audio** | Text-to-speech (OpenAI, ElevenLabs), speech-to-text, dashboard audio playback | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
|
|
139
139
|
| **Alerts** | Slack, webhook, and custom notifications | [Alerts](https://github.com/adham90/ruby_llm-agents/wiki/Alerts) |
|
|
140
140
|
|
|
141
141
|
## Quick Start
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
<%
|
|
2
|
+
response = @execution.response || {}
|
|
3
|
+
audio_src = response["audio_url"] || response[:audio_url] ||
|
|
4
|
+
response["audio_data_uri"] || response[:audio_data_uri]
|
|
5
|
+
audio_format = response["format"] || response[:format] ||
|
|
6
|
+
@execution.metadata&.dig("audio_format")
|
|
7
|
+
audio_duration = response["duration"] || response[:duration] ||
|
|
8
|
+
@execution.metadata&.dig("audio_duration_seconds")
|
|
9
|
+
audio_file_size = response["file_size"] || response[:file_size] ||
|
|
10
|
+
@execution.metadata&.dig("audio_file_size_bytes")
|
|
11
|
+
audio_voice = response["voice_id"] || response[:voice_id] ||
|
|
12
|
+
@execution.metadata&.dig("voice_id")
|
|
13
|
+
audio_provider = response["provider"] || response[:provider] ||
|
|
14
|
+
@execution.metadata&.dig("audio_provider")
|
|
15
|
+
audio_characters = @execution.metadata&.dig("audio_characters")
|
|
16
|
+
%>
|
|
17
|
+
|
|
18
|
+
<div class="flex items-center gap-3 mt-6 mb-3">
|
|
19
|
+
<span class="text-[10px] font-medium text-gray-400 dark:text-gray-600 uppercase tracking-widest font-mono">audio</span>
|
|
20
|
+
<div class="flex-1 border-t border-gray-200 dark:border-gray-800"></div>
|
|
21
|
+
</div>
|
|
22
|
+
|
|
23
|
+
<% if audio_src.present? %>
|
|
24
|
+
<div class="mb-3">
|
|
25
|
+
<audio controls preload="metadata" class="w-full max-w-lg" style="height: 36px;">
|
|
26
|
+
<source src="<%= audio_src %>">
|
|
27
|
+
Your browser does not support the audio element.
|
|
28
|
+
</audio>
|
|
29
|
+
</div>
|
|
30
|
+
<% end %>
|
|
31
|
+
|
|
32
|
+
<div class="flex flex-wrap items-center gap-x-4 gap-y-1 font-mono text-xs text-gray-400 dark:text-gray-500">
|
|
33
|
+
<% if audio_duration.present? %>
|
|
34
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= audio_duration.is_a?(Numeric) ? "#{audio_duration.round(1)}s" : audio_duration %></span> duration</span>
|
|
35
|
+
<% end %>
|
|
36
|
+
<% if audio_format.present? %>
|
|
37
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= audio_format %></span> format</span>
|
|
38
|
+
<% end %>
|
|
39
|
+
<% if audio_file_size.present? %>
|
|
40
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= number_to_human_size(audio_file_size) %></span> size</span>
|
|
41
|
+
<% end %>
|
|
42
|
+
<% if audio_voice.present? %>
|
|
43
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= audio_voice %></span> voice</span>
|
|
44
|
+
<% end %>
|
|
45
|
+
<% if audio_provider.present? %>
|
|
46
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= audio_provider %></span> provider</span>
|
|
47
|
+
<% end %>
|
|
48
|
+
<% if audio_characters.present? %>
|
|
49
|
+
<span><span class="text-gray-800 dark:text-gray-200"><%= number_to_human_short(audio_characters) %></span> characters</span>
|
|
50
|
+
<% end %>
|
|
51
|
+
</div>
|
|
52
|
+
|
|
53
|
+
<% if audio_src.blank? %>
|
|
54
|
+
<p class="text-xs text-gray-400 dark:text-gray-600 font-mono mt-2 italic">
|
|
55
|
+
No audio data stored. Enable <code class="text-gray-500 dark:text-gray-400">persist_audio_data</code> in config to play back Speaker audio here.
|
|
56
|
+
</p>
|
|
57
|
+
<% end %>
|
|
@@ -57,6 +57,14 @@
|
|
|
57
57
|
<% end %>
|
|
58
58
|
</div>
|
|
59
59
|
|
|
60
|
+
<!-- ── audio player ──────────────────── -->
|
|
61
|
+
<% if @execution.agent_type.to_s.match?(/Speaker|Narrator|Transcriber/i) ||
|
|
62
|
+
@execution.metadata&.dig("audio_duration_seconds").present? ||
|
|
63
|
+
@execution.response&.dig("audio_data_uri").present? ||
|
|
64
|
+
@execution.response&.dig("audio_url").present? %>
|
|
65
|
+
<%= render "ruby_llm/agents/executions/audio_player" %>
|
|
66
|
+
<% end %>
|
|
67
|
+
|
|
60
68
|
<!-- ── tokens ──────────────────────── -->
|
|
61
69
|
<%
|
|
62
70
|
input_tokens = @execution.input_tokens || 0
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Agents
|
|
5
|
+
class Speaker
|
|
6
|
+
# ActiveStorage integration for speakers
|
|
7
|
+
#
|
|
8
|
+
# Provides convenience methods for generating audio and directly
|
|
9
|
+
# attaching it to ActiveStorage attachments.
|
|
10
|
+
#
|
|
11
|
+
# @example Attaching to a model
|
|
12
|
+
# class Article < ApplicationRecord
|
|
13
|
+
# has_one_attached :narration
|
|
14
|
+
# end
|
|
15
|
+
#
|
|
16
|
+
# class ArticleNarrator < RubyLLM::Agents::Speaker
|
|
17
|
+
# include RubyLLM::Agents::Speaker::ActiveStorageSupport
|
|
18
|
+
#
|
|
19
|
+
# provider :openai
|
|
20
|
+
# model 'tts-1-hd'
|
|
21
|
+
# voice 'nova'
|
|
22
|
+
# end
|
|
23
|
+
#
|
|
24
|
+
# article = Article.find(1)
|
|
25
|
+
# result = ArticleNarrator.speak_and_attach(
|
|
26
|
+
# text: article.body,
|
|
27
|
+
# record: article,
|
|
28
|
+
# attachment_name: :narration
|
|
29
|
+
# )
|
|
30
|
+
#
|
|
31
|
+
module ActiveStorageSupport
|
|
32
|
+
extend ActiveSupport::Concern
|
|
33
|
+
|
|
34
|
+
class_methods do
|
|
35
|
+
# Generate audio and attach it to a record
|
|
36
|
+
#
|
|
37
|
+
# @param text [String] Text to convert to speech
|
|
38
|
+
# @param record [ActiveRecord::Base] The record to attach to
|
|
39
|
+
# @param attachment_name [Symbol] Name of the attachment (e.g., :narration)
|
|
40
|
+
# @param options [Hash] Additional options for generation
|
|
41
|
+
# @return [SpeechResult] The speech result with audio_url set
|
|
42
|
+
def speak_and_attach(text:, record:, attachment_name:, **options)
|
|
43
|
+
result = call(text: text, **options)
|
|
44
|
+
|
|
45
|
+
return result unless result.success?
|
|
46
|
+
|
|
47
|
+
attach_audio_to_record(result, record, attachment_name, options)
|
|
48
|
+
|
|
49
|
+
result
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
private
|
|
53
|
+
|
|
54
|
+
def attach_audio_to_record(result, record, attachment_name, options)
|
|
55
|
+
attachment = record.public_send(attachment_name)
|
|
56
|
+
filename = options[:filename] || generate_audio_filename(result)
|
|
57
|
+
|
|
58
|
+
attachment.attach(
|
|
59
|
+
io: StringIO.new(result.audio),
|
|
60
|
+
filename: filename,
|
|
61
|
+
content_type: result.content_type
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
result.audio_key = attachment.blob.key if attachment.respond_to?(:blob) && attachment.blob
|
|
65
|
+
result.audio_url = blob_url(attachment) if attachment.respond_to?(:blob) && attachment.blob
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def blob_url(attachment)
|
|
69
|
+
if attachment.blob.respond_to?(:url)
|
|
70
|
+
attachment.blob.url
|
|
71
|
+
elsif attachment.blob.respond_to?(:service_url)
|
|
72
|
+
attachment.blob.service_url
|
|
73
|
+
end
|
|
74
|
+
rescue => _e
|
|
75
|
+
nil
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def generate_audio_filename(result)
|
|
79
|
+
timestamp = Time.current.to_i
|
|
80
|
+
ext = result.format || :mp3
|
|
81
|
+
"speech_#{timestamp}.#{ext}"
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
@@ -336,6 +336,14 @@ module RubyLLM
|
|
|
336
336
|
context.output_tokens = 0
|
|
337
337
|
context.total_cost = calculate_cost(result)
|
|
338
338
|
|
|
339
|
+
# Store audio-specific metadata for execution tracking
|
|
340
|
+
context[:provider] = result[:provider].to_s
|
|
341
|
+
context[:voice_id] = (resolved_voice_id || resolved_voice).to_s
|
|
342
|
+
context[:characters] = result[:characters]
|
|
343
|
+
context[:output_format] = result[:format].to_s
|
|
344
|
+
context[:file_size] = result[:audio]&.bytesize
|
|
345
|
+
context[:audio_duration_seconds] = result[:duration] if result[:duration]
|
|
346
|
+
|
|
339
347
|
# Build final result
|
|
340
348
|
context.output = build_result(
|
|
341
349
|
result,
|
|
@@ -559,3 +567,5 @@ module RubyLLM
|
|
|
559
567
|
end
|
|
560
568
|
end
|
|
561
569
|
end
|
|
570
|
+
|
|
571
|
+
require_relative "speaker/active_storage_support"
|
|
@@ -318,6 +318,16 @@ module RubyLLM
|
|
|
318
318
|
context.output_tokens = 0
|
|
319
319
|
context.total_cost = calculate_cost(raw_result)
|
|
320
320
|
|
|
321
|
+
# Store transcription-specific metadata for execution tracking
|
|
322
|
+
context[:language] = resolved_language if resolved_language
|
|
323
|
+
context[:detected_language] = raw_result[:language] if raw_result[:language]
|
|
324
|
+
context[:audio_duration_seconds] = raw_result[:duration] if raw_result[:duration]
|
|
325
|
+
context[:audio_minutes] = (raw_result[:duration] / 60.0).round(4) if raw_result[:duration]
|
|
326
|
+
context[:output_format] = self.class.output_format.to_s
|
|
327
|
+
context[:timestamp_granularity] = self.class.include_timestamps.to_s
|
|
328
|
+
context[:segment_count] = raw_result[:segments]&.size if raw_result[:segments]
|
|
329
|
+
context[:word_count] = raw_result[:text]&.split(/\s+/)&.size if raw_result[:text]
|
|
330
|
+
|
|
321
331
|
# Build final result
|
|
322
332
|
context.output = build_result(
|
|
323
333
|
raw_result,
|
|
@@ -452,7 +452,8 @@ module RubyLLM
|
|
|
452
452
|
:root_directory,
|
|
453
453
|
:root_namespace,
|
|
454
454
|
:tool_result_max_length,
|
|
455
|
-
:redaction
|
|
455
|
+
:redaction,
|
|
456
|
+
:persist_audio_data
|
|
456
457
|
|
|
457
458
|
# Attributes with validation (readers only, custom setters below)
|
|
458
459
|
attr_reader :default_temperature,
|
|
@@ -734,6 +735,9 @@ module RubyLLM
|
|
|
734
735
|
|
|
735
736
|
# Redaction defaults (disabled by default)
|
|
736
737
|
@redaction = nil
|
|
738
|
+
|
|
739
|
+
# Audio data persistence (disabled by default — base64 audio can be large)
|
|
740
|
+
@persist_audio_data = false
|
|
737
741
|
end
|
|
738
742
|
|
|
739
743
|
# Returns the configured cache store, falling back to Rails.cache
|
|
@@ -280,6 +280,9 @@ module RubyLLM
|
|
|
280
280
|
detail_data[:response] = serialize_response(context)
|
|
281
281
|
end
|
|
282
282
|
|
|
283
|
+
# Persist audio data for Speaker executions
|
|
284
|
+
maybe_persist_audio_response(context, detail_data)
|
|
285
|
+
|
|
283
286
|
has_data = detail_data.values.any? { |v| v.present? && v != {} && v != [] }
|
|
284
287
|
return unless has_data
|
|
285
288
|
|
|
@@ -376,6 +379,10 @@ module RubyLLM
|
|
|
376
379
|
if global_config.persist_responses && context.output.respond_to?(:content)
|
|
377
380
|
detail_data[:response] = serialize_response(context)
|
|
378
381
|
end
|
|
382
|
+
|
|
383
|
+
# Persist audio data for Speaker executions
|
|
384
|
+
maybe_persist_audio_response(context, detail_data)
|
|
385
|
+
|
|
379
386
|
data[:_detail_data] = detail_data
|
|
380
387
|
|
|
381
388
|
data
|
|
@@ -463,6 +470,48 @@ module RubyLLM
|
|
|
463
470
|
nil
|
|
464
471
|
end
|
|
465
472
|
|
|
473
|
+
# Persists audio response data for Speaker executions
|
|
474
|
+
#
|
|
475
|
+
# When persist_audio_data is enabled and the output is a SpeechResult with
|
|
476
|
+
# audio binary data, stores a base64 data URI in the response column.
|
|
477
|
+
# Always stores audio_url if present (lightweight, no binary).
|
|
478
|
+
#
|
|
479
|
+
# @param context [Context] The execution context
|
|
480
|
+
# @param detail_data [Hash] The detail data hash to modify
|
|
481
|
+
def maybe_persist_audio_response(context, detail_data)
|
|
482
|
+
return unless context.output.is_a?(RubyLLM::Agents::SpeechResult)
|
|
483
|
+
|
|
484
|
+
# Always persist audio_url if present (it's just a string, no binary)
|
|
485
|
+
if context.output.audio_url.present?
|
|
486
|
+
detail_data[:response] ||= {}
|
|
487
|
+
detail_data[:response][:audio_url] = context.output.audio_url
|
|
488
|
+
end
|
|
489
|
+
|
|
490
|
+
# Persist full audio data URI only when opted in
|
|
491
|
+
return unless global_config.respond_to?(:persist_audio_data) && global_config.persist_audio_data
|
|
492
|
+
return unless context.output.audio.present?
|
|
493
|
+
|
|
494
|
+
detail_data[:response] = serialize_audio_response(context.output)
|
|
495
|
+
rescue => e
|
|
496
|
+
error("Failed to persist audio response: #{e.message}")
|
|
497
|
+
end
|
|
498
|
+
|
|
499
|
+
# Serializes a SpeechResult into a hash for the response column
|
|
500
|
+
#
|
|
501
|
+
# @param result [SpeechResult] The speech result to serialize
|
|
502
|
+
# @return [Hash] Serialized audio response data
|
|
503
|
+
def serialize_audio_response(result)
|
|
504
|
+
{
|
|
505
|
+
audio_data_uri: result.to_data_uri,
|
|
506
|
+
audio_url: result.audio_url,
|
|
507
|
+
format: result.format.to_s,
|
|
508
|
+
duration: result.duration,
|
|
509
|
+
file_size: result.file_size,
|
|
510
|
+
voice_id: result.voice_id,
|
|
511
|
+
provider: result.provider.to_s
|
|
512
|
+
}.compact
|
|
513
|
+
end
|
|
514
|
+
|
|
466
515
|
# Queues async logging via background job
|
|
467
516
|
#
|
|
468
517
|
# @param data [Hash] Execution data
|
|
@@ -29,17 +29,17 @@ module RubyLLM
|
|
|
29
29
|
# @return [String, nil] Binary audio data
|
|
30
30
|
attr_reader :audio
|
|
31
31
|
|
|
32
|
-
# @!attribute [
|
|
32
|
+
# @!attribute [rw] audio_url
|
|
33
33
|
# @return [String, nil] URL if audio was stored remotely
|
|
34
|
-
|
|
34
|
+
attr_accessor :audio_url
|
|
35
35
|
|
|
36
|
-
# @!attribute [
|
|
36
|
+
# @!attribute [rw] audio_key
|
|
37
37
|
# @return [String, nil] Storage key if stored
|
|
38
|
-
|
|
38
|
+
attr_accessor :audio_key
|
|
39
39
|
|
|
40
|
-
# @!attribute [
|
|
40
|
+
# @!attribute [rw] audio_path
|
|
41
41
|
# @return [String, nil] Local file path if saved
|
|
42
|
-
|
|
42
|
+
attr_accessor :audio_path
|
|
43
43
|
|
|
44
44
|
# @!endgroup
|
|
45
45
|
|
|
@@ -308,7 +308,12 @@ module RubyLLM
|
|
|
308
308
|
}
|
|
309
309
|
end
|
|
310
310
|
|
|
311
|
-
|
|
311
|
+
# Returns MIME type for the audio format
|
|
312
|
+
#
|
|
313
|
+
# @return [String] MIME type
|
|
314
|
+
def content_type
|
|
315
|
+
mime_type_for_format
|
|
316
|
+
end
|
|
312
317
|
|
|
313
318
|
# Returns MIME type for the audio format
|
|
314
319
|
#
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby_llm-agents
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.2.0
|
|
4
|
+
version: 3.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- adham90
|
|
@@ -111,6 +111,7 @@ files:
|
|
|
111
111
|
- app/views/ruby_llm/agents/dashboard/_action_center.html.erb
|
|
112
112
|
- app/views/ruby_llm/agents/dashboard/_tenant_budget.html.erb
|
|
113
113
|
- app/views/ruby_llm/agents/dashboard/index.html.erb
|
|
114
|
+
- app/views/ruby_llm/agents/executions/_audio_player.html.erb
|
|
114
115
|
- app/views/ruby_llm/agents/executions/_execution.html.erb
|
|
115
116
|
- app/views/ruby_llm/agents/executions/_filters.html.erb
|
|
116
117
|
- app/views/ruby_llm/agents/executions/_list.html.erb
|
|
@@ -209,6 +210,7 @@ files:
|
|
|
209
210
|
- lib/ruby_llm-agents.rb
|
|
210
211
|
- lib/ruby_llm/agents.rb
|
|
211
212
|
- lib/ruby_llm/agents/audio/speaker.rb
|
|
213
|
+
- lib/ruby_llm/agents/audio/speaker/active_storage_support.rb
|
|
212
214
|
- lib/ruby_llm/agents/audio/speech_client.rb
|
|
213
215
|
- lib/ruby_llm/agents/audio/speech_pricing.rb
|
|
214
216
|
- lib/ruby_llm/agents/audio/transcriber.rb
|