ruby_llm-agents 3.2.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d8c4a83ecc9e39e7df7243b98a51d1c249a963f3a1f96551ebefae13becb50c5
4
- data.tar.gz: d042bda1737b7593187896e879b3065cc855e990e1406410e99d1d853819f3a9
3
+ metadata.gz: 463487c17c50bf1496a30c9eea51dab3c334a17010853da97da5624a6cf564b5
4
+ data.tar.gz: 470a6666266d17dc8190f5118eec0c5f674fb51bae8b368d8713ea65d1882025
5
5
  SHA512:
6
- metadata.gz: 78b6fa31a8a656c36e0bb51f6e7a405101e90f47afc3ca35f87c392878b6a2d986b2b275c6424826ffb6b5d4bfb059f4632e8976aa5b2a473eab540619bd18cb
7
- data.tar.gz: 3d5890ea864aea3531e96571b6010c9fdcedc5a15e8966c7974138cde0ebaec89a771e33fd20e3fd2d5097a1ecfc398236d833f9852f046fd3b1f720eaf7fb6e
6
+ metadata.gz: b1e2d4688dfc294c3b94c95df084a248fa25fbcd7d99910f31f72b85138bf37a392a8b350462ee341d5d209674b844a9d2692a177db30845857a586fa77ce3bc
7
+ data.tar.gz: 50130237011f12c808a073d55b9083ce8449f8e0ddf8dd800c13134104f48233ea4f0fac3ddcbcc9a8b45b91814f67db5c57632dd6d54930c1ffd19fda825e96
data/README.md CHANGED
@@ -135,7 +135,7 @@ result.save("logo.png")
135
135
  | **Attachments** | Images, PDFs, and multimodal support | [Attachments](https://github.com/adham90/ruby_llm-agents/wiki/Attachments) |
136
136
  | **Embeddings** | Vector embeddings with batching, caching, and preprocessing | [Embeddings](https://github.com/adham90/ruby_llm-agents/wiki/Embeddings) |
137
137
  | **Image Operations** | Generation, analysis, editing, pipelines with cost tracking | [Images](https://github.com/adham90/ruby_llm-agents/wiki/Image-Generation) |
138
- | **Audio** | Text-to-speech (OpenAI, ElevenLabs) and speech-to-text with cost tracking | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
138
+ | **Audio** | Text-to-speech (OpenAI, ElevenLabs), speech-to-text, and dashboard audio playback | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
139
139
  | **Alerts** | Slack, webhook, and custom notifications | [Alerts](https://github.com/adham90/ruby_llm-agents/wiki/Alerts) |
140
140
 
141
141
  ## Quick Start
@@ -0,0 +1,57 @@
1
<%
  # Audio section of the execution detail view.
  #
  # Reads audio fields from @execution.response (string or symbol keys) and
  # falls back to @execution.metadata. Renders an <audio> player when a URL or
  # data URI is stored, a row of audio stats, and a hint when nothing is playable.
  response = @execution.response || {}
  metadata = @execution.metadata

  # Blank values (e.g. "") count as missing so the metadata fallback still applies.
  from_response = ->(key) { response[key.to_s].presence || response[key.to_sym].presence }
  # First truthy value among the given metadata keys; nil when metadata is nil.
  from_metadata = ->(*keys) { keys.lazy.map { |key| metadata&.dig(key) }.find(&:itself) }

  # Besides the "audio_*" names, also try the keys Speaker sets on the execution
  # context (provider, characters, output_format, file_size).
  # NOTE(review): assumes context keys are stored in metadata under the same
  # names — confirm against the instrumentation code.
  # Transcriber sets output_format to the transcript format, so that key is
  # not used as an audio format for transcriber executions.
  transcriber = @execution.agent_type.to_s.match?(/Transcriber/i)
  format_keys = transcriber ? ["audio_format"] : ["audio_format", "output_format"]

  audio_src        = from_response.call("audio_url") || from_response.call("audio_data_uri")
  audio_format     = from_response.call("format")    || from_metadata.call(*format_keys)
  audio_duration   = from_response.call("duration")  || from_metadata.call("audio_duration_seconds")
  audio_file_size  = from_response.call("file_size") || from_metadata.call("audio_file_size_bytes", "file_size")
  audio_voice      = from_response.call("voice_id")  || from_metadata.call("voice_id")
  audio_provider   = from_response.call("provider")  || from_metadata.call("audio_provider", "provider")
  audio_characters = from_metadata.call("audio_characters", "characters")
%>

<div class="flex items-center gap-3 mt-6 mb-3">
  <span class="text-[10px] font-medium text-gray-400 dark:text-gray-600 uppercase tracking-widest font-mono">audio</span>
  <div class="flex-1 border-t border-gray-200 dark:border-gray-800"></div>
</div>

<% if audio_src.present? %>
  <div class="mb-3">
    <audio controls preload="metadata" class="w-full max-w-lg" style="height: 36px;">
      <source src="<%= audio_src %>">
      Your browser does not support the audio element.
    </audio>
  </div>
<% end %>

<div class="flex flex-wrap items-center gap-x-4 gap-y-1 font-mono text-xs text-gray-400 dark:text-gray-500">
  <% if audio_duration.present? %>
    <span><span class="text-gray-800 dark:text-gray-200"><%= audio_duration.is_a?(Numeric) ? "#{audio_duration.round(1)}s" : audio_duration %></span> duration</span>
  <% end %>
  <% if audio_format.present? %>
    <span><span class="text-gray-800 dark:text-gray-200"><%= audio_format %></span> format</span>
  <% end %>
  <% if audio_file_size.present? %>
    <span><span class="text-gray-800 dark:text-gray-200"><%= number_to_human_size(audio_file_size) %></span> size</span>
  <% end %>
  <% if audio_voice.present? %>
    <span><span class="text-gray-800 dark:text-gray-200"><%= audio_voice %></span> voice</span>
  <% end %>
  <% if audio_provider.present? %>
    <span><span class="text-gray-800 dark:text-gray-200"><%= audio_provider %></span> provider</span>
  <% end %>
  <% if audio_characters.present? %>
    <span><span class="text-gray-800 dark:text-gray-200"><%= number_to_human_short(audio_characters) %></span> characters</span>
  <% end %>
</div>

<% if audio_src.blank? %>
  <p class="text-xs text-gray-400 dark:text-gray-600 font-mono mt-2 italic">
    No audio data stored. Enable <code class="text-gray-500 dark:text-gray-400">persist_audio_data</code> in config to play back Speaker audio here.
  </p>
<% end %>
@@ -57,6 +57,14 @@
57
57
  <% end %>
58
58
  </div>
59
59
 
60
+ <!-- ── audio player ──────────────────── -->
61
+ <% if @execution.agent_type.to_s.match?(/Speaker|Narrator|Transcriber/i) ||
62
+ @execution.metadata&.dig("audio_duration_seconds").present? ||
63
+ @execution.response&.dig("audio_data_uri").present? ||
64
+ @execution.response&.dig("audio_url").present? %>
65
+ <%= render "ruby_llm/agents/executions/audio_player" %>
66
+ <% end %>
67
+
60
68
  <!-- ── tokens ──────────────────────── -->
61
69
  <%
62
70
  input_tokens = @execution.input_tokens || 0
@@ -0,0 +1,87 @@
1
# frozen_string_literal: true

require "stringio"

module RubyLLM
  module Agents
    class Speaker
      # ActiveStorage integration for speakers
      #
      # Provides convenience methods for generating audio and directly
      # attaching it to ActiveStorage attachments.
      #
      # @example Attaching to a model
      #   class Article < ApplicationRecord
      #     has_one_attached :narration
      #   end
      #
      #   class ArticleNarrator < RubyLLM::Agents::Speaker
      #     include RubyLLM::Agents::Speaker::ActiveStorageSupport
      #
      #     provider :openai
      #     model 'tts-1-hd'
      #     voice 'nova'
      #   end
      #
      #   article = Article.find(1)
      #   result = ArticleNarrator.speak_and_attach(
      #     text: article.body,
      #     record: article,
      #     attachment_name: :narration
      #   )
      #
      module ActiveStorageSupport
        extend ActiveSupport::Concern

        class_methods do
          # Generate audio and attach it to a record
          #
          # The result is returned untouched when generation did not succeed.
          # On success the audio is attached and, when the attachment exposes a
          # blob, the result's audio_key and audio_url are filled in.
          #
          # @param text [String] Text to convert to speech
          # @param record [ActiveRecord::Base] The record to attach to
          # @param attachment_name [Symbol] Name of the attachment (e.g., :narration)
          # @param options [Hash] Additional options for generation; :filename
          #   (optional) also names the attached file
          # @return [SpeechResult] The speech result, with audio_key/audio_url set
          #   when a blob is available
          def speak_and_attach(text:, record:, attachment_name:, **options)
            result = call(text: text, **options)
            return result unless result.success?

            attach_audio_to_record(result, record, attachment_name, options)
            result
          end

          private

          # Attaches the generated audio to the record's attachment and copies
          # the blob key and URL back onto the result.
          #
          # @param result [SpeechResult] Successful speech result
          # @param record [Object] Record owning the attachment
          # @param attachment_name [Symbol] Attachment reader to call on the record
          # @param options [Hash] Caller options; only :filename is read here
          # @return [void]
          def attach_audio_to_record(result, record, attachment_name, options)
            audio = result.audio
            return if audio.nil? # nothing to upload

            attachment = record.public_send(attachment_name)
            attachment.attach(
              io: StringIO.new(audio),
              filename: options[:filename] || generate_audio_filename(result),
              content_type: result.content_type
            )

            # Look the blob up once; attachments without a blob leave the result as-is.
            blob = attachment.blob if attachment.respond_to?(:blob)
            return unless blob

            result.audio_key = blob.key
            result.audio_url = blob_url(attachment)
          end

          # Resolves a URL for the attached blob, preferring `url` over `service_url`.
          #
          # @param attachment [Object] Attachment whose blob should be linked
          # @return [String, nil] The URL, or nil when neither method exists or
          #   building the URL raised
          def blob_url(attachment)
            blob = attachment.blob
            if blob.respond_to?(:url)
              blob.url
            elsif blob.respond_to?(:service_url)
              blob.service_url
            end
          rescue StandardError
            # Best-effort: the upload already happened, so a failure to build the
            # URL only leaves audio_url unset instead of failing the whole call.
            nil
          end

          # Builds a default filename from the current Unix timestamp and the
          # result's format (mp3 when the result has none).
          #
          # @param result [SpeechResult] Speech result providing the format
          # @return [String] e.g. "speech_1700000000.mp3"
          def generate_audio_filename(result)
            extension = result.format || :mp3
            "speech_#{Time.current.to_i}.#{extension}"
          end
        end
      end
    end
  end
end
@@ -336,6 +336,14 @@ module RubyLLM
336
336
  context.output_tokens = 0
337
337
  context.total_cost = calculate_cost(result)
338
338
 
339
+ # Store audio-specific metadata for execution tracking
340
+ context[:provider] = result[:provider].to_s
341
+ context[:voice_id] = (resolved_voice_id || resolved_voice).to_s
342
+ context[:characters] = result[:characters]
343
+ context[:output_format] = result[:format].to_s
344
+ context[:file_size] = result[:audio]&.bytesize
345
+ context[:audio_duration_seconds] = result[:duration] if result[:duration]
346
+
339
347
  # Build final result
340
348
  context.output = build_result(
341
349
  result,
@@ -559,3 +567,5 @@ module RubyLLM
559
567
  end
560
568
  end
561
569
  end
570
+
571
+ require_relative "speaker/active_storage_support"
@@ -318,6 +318,16 @@ module RubyLLM
318
318
  context.output_tokens = 0
319
319
  context.total_cost = calculate_cost(raw_result)
320
320
 
321
+ # Store transcription-specific metadata for execution tracking
322
+ context[:language] = resolved_language if resolved_language
323
+ context[:detected_language] = raw_result[:language] if raw_result[:language]
324
+ context[:audio_duration_seconds] = raw_result[:duration] if raw_result[:duration]
325
+ context[:audio_minutes] = (raw_result[:duration] / 60.0).round(4) if raw_result[:duration]
326
+ context[:output_format] = self.class.output_format.to_s
327
+ context[:timestamp_granularity] = self.class.include_timestamps.to_s
328
+ context[:segment_count] = raw_result[:segments]&.size if raw_result[:segments]
329
+ context[:word_count] = raw_result[:text]&.split(/\s+/)&.size if raw_result[:text]
330
+
321
331
  # Build final result
322
332
  context.output = build_result(
323
333
  raw_result,
@@ -452,7 +452,8 @@ module RubyLLM
452
452
  :root_directory,
453
453
  :root_namespace,
454
454
  :tool_result_max_length,
455
- :redaction
455
+ :redaction,
456
+ :persist_audio_data
456
457
 
457
458
  # Attributes with validation (readers only, custom setters below)
458
459
  attr_reader :default_temperature,
@@ -734,6 +735,9 @@ module RubyLLM
734
735
 
735
736
  # Redaction defaults (disabled by default)
736
737
  @redaction = nil
738
+
739
+ # Audio data persistence (disabled by default — base64 audio can be large)
740
+ @persist_audio_data = false
737
741
  end
738
742
 
739
743
  # Returns the configured cache store, falling back to Rails.cache
@@ -4,6 +4,6 @@ module RubyLLM
4
4
  module Agents
5
5
  # Current version of the RubyLLM::Agents gem
6
6
  # @return [String] Semantic version string
7
- VERSION = "3.2.0"
7
+ VERSION = "3.3.0"
8
8
  end
9
9
  end
@@ -280,6 +280,9 @@ module RubyLLM
280
280
  detail_data[:response] = serialize_response(context)
281
281
  end
282
282
 
283
+ # Persist audio data for Speaker executions
284
+ maybe_persist_audio_response(context, detail_data)
285
+
283
286
  has_data = detail_data.values.any? { |v| v.present? && v != {} && v != [] }
284
287
  return unless has_data
285
288
 
@@ -376,6 +379,10 @@ module RubyLLM
376
379
  if global_config.persist_responses && context.output.respond_to?(:content)
377
380
  detail_data[:response] = serialize_response(context)
378
381
  end
382
+
383
+ # Persist audio data for Speaker executions
384
+ maybe_persist_audio_response(context, detail_data)
385
+
379
386
  data[:_detail_data] = detail_data
380
387
 
381
388
  data
@@ -463,6 +470,48 @@ module RubyLLM
463
470
  nil
464
471
  end
465
472
 
473
+ # Persists audio response data when the execution output is a SpeechResult
474
+ #
475
+ # The audio_url, when present, is always written to detail_data[:response].
476
+ # When persist_audio_data is enabled and the output carries audio binary data,
477
+ # detail_data[:response] is replaced by the hash from serialize_audio_response.
478
+ # Any error raised here is reported through `error` and not re-raised.
479
+ # @param context [Context] The execution context
480
+ # @param detail_data [Hash] The detail data hash to modify
481
+ def maybe_persist_audio_response(context, detail_data)
482
+ return unless context.output.is_a?(RubyLLM::Agents::SpeechResult)
483
+
484
+ # Always persist audio_url if present (it's just a string, no binary)
485
+ if context.output.audio_url.present?
486
+ detail_data[:response] ||= {}
487
+ detail_data[:response][:audio_url] = context.output.audio_url
488
+ end
489
+
490
+ # Persist full audio data URI only when opted in (respond_to? guards configs without the setting)
491
+ return unless global_config.respond_to?(:persist_audio_data) && global_config.persist_audio_data
492
+ return unless context.output.audio.present?
493
+
494
+ detail_data[:response] = serialize_audio_response(context.output) # replaces the whole response hash
495
+ rescue => e
496
+ error("Failed to persist audio response: #{e.message}")
497
+ end
498
+
499
# Serializes a SpeechResult into a hash for the response column
#
# Nil attributes are omitted. `format` and `provider` are stringified only
# when set, so a missing value is dropped by `compact` instead of being stored
# as an empty string (which is truthy and would mask any fallback a reader
# applies with `||`).
#
# @param result [SpeechResult] The speech result to serialize
# @return [Hash] Serialized audio response data (symbol keys, nil values removed)
def serialize_audio_response(result)
  {
    audio_data_uri: result.to_data_uri,
    audio_url: result.audio_url,
    format: result.format&.to_s,
    duration: result.duration,
    file_size: result.file_size,
    voice_id: result.voice_id,
    provider: result.provider&.to_s
  }.compact
end
514
+
466
515
  # Queues async logging via background job
467
516
  #
468
517
  # @param data [Hash] Execution data
@@ -29,17 +29,17 @@ module RubyLLM
29
29
  # @return [String, nil] Binary audio data
30
30
  attr_reader :audio
31
31
 
32
- # @!attribute [r] audio_url
32
+ # @!attribute [rw] audio_url
33
33
  # @return [String, nil] URL if audio was stored remotely
34
- attr_reader :audio_url
34
+ attr_accessor :audio_url
35
35
 
36
- # @!attribute [r] audio_key
36
+ # @!attribute [rw] audio_key
37
37
  # @return [String, nil] Storage key if stored
38
- attr_reader :audio_key
38
+ attr_accessor :audio_key
39
39
 
40
- # @!attribute [r] audio_path
40
+ # @!attribute [rw] audio_path
41
41
  # @return [String, nil] Local file path if saved
42
- attr_reader :audio_path
42
+ attr_accessor :audio_path
43
43
 
44
44
  # @!endgroup
45
45
 
@@ -308,7 +308,12 @@ module RubyLLM
308
308
  }
309
309
  end
310
310
 
311
- private
311
+ # Returns the MIME type for the audio format by delegating to mime_type_for_format
312
+ #
313
+ # @return [String] MIME type
314
+ def content_type
315
+ mime_type_for_format
316
+ end
312
317
 
313
318
  # Returns MIME type for the audio format
314
319
  #
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_llm-agents
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.0
4
+ version: 3.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - adham90
@@ -111,6 +111,7 @@ files:
111
111
  - app/views/ruby_llm/agents/dashboard/_action_center.html.erb
112
112
  - app/views/ruby_llm/agents/dashboard/_tenant_budget.html.erb
113
113
  - app/views/ruby_llm/agents/dashboard/index.html.erb
114
+ - app/views/ruby_llm/agents/executions/_audio_player.html.erb
114
115
  - app/views/ruby_llm/agents/executions/_execution.html.erb
115
116
  - app/views/ruby_llm/agents/executions/_filters.html.erb
116
117
  - app/views/ruby_llm/agents/executions/_list.html.erb
@@ -209,6 +210,7 @@ files:
209
210
  - lib/ruby_llm-agents.rb
210
211
  - lib/ruby_llm/agents.rb
211
212
  - lib/ruby_llm/agents/audio/speaker.rb
213
+ - lib/ruby_llm/agents/audio/speaker/active_storage_support.rb
212
214
  - lib/ruby_llm/agents/audio/speech_client.rb
213
215
  - lib/ruby_llm/agents/audio/speech_pricing.rb
214
216
  - lib/ruby_llm/agents/audio/transcriber.rb