ruby_llm-agents 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +189 -31
  3. data/app/controllers/ruby_llm/agents/agents_controller.rb +136 -16
  4. data/app/controllers/ruby_llm/agents/dashboard_controller.rb +29 -9
  5. data/app/controllers/ruby_llm/agents/workflows_controller.rb +355 -0
  6. data/app/helpers/ruby_llm/agents/application_helper.rb +25 -0
  7. data/app/models/ruby_llm/agents/execution.rb +3 -0
  8. data/app/models/ruby_llm/agents/tenant_budget.rb +58 -15
  9. data/app/services/ruby_llm/agents/agent_registry.rb +51 -12
  10. data/app/views/layouts/ruby_llm/agents/application.html.erb +2 -29
  11. data/app/views/ruby_llm/agents/agents/_agent.html.erb +13 -1
  12. data/app/views/ruby_llm/agents/agents/_config_agent.html.erb +235 -0
  13. data/app/views/ruby_llm/agents/agents/_config_embedder.html.erb +70 -0
  14. data/app/views/ruby_llm/agents/agents/_config_image_generator.html.erb +152 -0
  15. data/app/views/ruby_llm/agents/agents/_config_moderator.html.erb +63 -0
  16. data/app/views/ruby_llm/agents/agents/_config_speaker.html.erb +108 -0
  17. data/app/views/ruby_llm/agents/agents/_config_transcriber.html.erb +91 -0
  18. data/app/views/ruby_llm/agents/agents/_workflow.html.erb +1 -1
  19. data/app/views/ruby_llm/agents/agents/index.html.erb +74 -9
  20. data/app/views/ruby_llm/agents/agents/show.html.erb +18 -378
  21. data/app/views/ruby_llm/agents/dashboard/_agent_comparison.html.erb +269 -15
  22. data/app/views/ruby_llm/agents/executions/show.html.erb +16 -0
  23. data/app/views/ruby_llm/agents/shared/_agent_type_badge.html.erb +93 -0
  24. data/app/views/ruby_llm/agents/workflows/_step_performance.html.erb +236 -0
  25. data/app/views/ruby_llm/agents/workflows/_structure_parallel.html.erb +76 -0
  26. data/app/views/ruby_llm/agents/workflows/_structure_pipeline.html.erb +74 -0
  27. data/app/views/ruby_llm/agents/workflows/_structure_router.html.erb +108 -0
  28. data/app/views/ruby_llm/agents/workflows/show.html.erb +442 -0
  29. data/config/routes.rb +1 -0
  30. data/lib/generators/ruby_llm_agents/agent_generator.rb +56 -7
  31. data/lib/generators/ruby_llm_agents/background_remover_generator.rb +110 -0
  32. data/lib/generators/ruby_llm_agents/embedder_generator.rb +107 -0
  33. data/lib/generators/ruby_llm_agents/image_analyzer_generator.rb +115 -0
  34. data/lib/generators/ruby_llm_agents/image_editor_generator.rb +108 -0
  35. data/lib/generators/ruby_llm_agents/image_generator_generator.rb +116 -0
  36. data/lib/generators/ruby_llm_agents/image_pipeline_generator.rb +178 -0
  37. data/lib/generators/ruby_llm_agents/image_transformer_generator.rb +109 -0
  38. data/lib/generators/ruby_llm_agents/image_upscaler_generator.rb +103 -0
  39. data/lib/generators/ruby_llm_agents/image_variator_generator.rb +102 -0
  40. data/lib/generators/ruby_llm_agents/install_generator.rb +76 -4
  41. data/lib/generators/ruby_llm_agents/restructure_generator.rb +292 -0
  42. data/lib/generators/ruby_llm_agents/speaker_generator.rb +121 -0
  43. data/lib/generators/ruby_llm_agents/templates/add_execution_type_migration.rb.tt +8 -0
  44. data/lib/generators/ruby_llm_agents/templates/agent.rb.tt +99 -84
  45. data/lib/generators/ruby_llm_agents/templates/application_agent.rb.tt +42 -40
  46. data/lib/generators/ruby_llm_agents/templates/application_background_remover.rb.tt +26 -0
  47. data/lib/generators/ruby_llm_agents/templates/application_embedder.rb.tt +50 -0
  48. data/lib/generators/ruby_llm_agents/templates/application_image_analyzer.rb.tt +26 -0
  49. data/lib/generators/ruby_llm_agents/templates/application_image_editor.rb.tt +20 -0
  50. data/lib/generators/ruby_llm_agents/templates/application_image_generator.rb.tt +38 -0
  51. data/lib/generators/ruby_llm_agents/templates/application_image_pipeline.rb.tt +139 -0
  52. data/lib/generators/ruby_llm_agents/templates/application_image_transformer.rb.tt +21 -0
  53. data/lib/generators/ruby_llm_agents/templates/application_image_upscaler.rb.tt +20 -0
  54. data/lib/generators/ruby_llm_agents/templates/application_image_variator.rb.tt +20 -0
  55. data/lib/generators/ruby_llm_agents/templates/application_speaker.rb.tt +49 -0
  56. data/lib/generators/ruby_llm_agents/templates/application_transcriber.rb.tt +53 -0
  57. data/lib/generators/ruby_llm_agents/templates/background_remover.rb.tt +44 -0
  58. data/lib/generators/ruby_llm_agents/templates/embedder.rb.tt +41 -0
  59. data/lib/generators/ruby_llm_agents/templates/image_analyzer.rb.tt +45 -0
  60. data/lib/generators/ruby_llm_agents/templates/image_editor.rb.tt +35 -0
  61. data/lib/generators/ruby_llm_agents/templates/image_generator.rb.tt +47 -0
  62. data/lib/generators/ruby_llm_agents/templates/image_pipeline.rb.tt +50 -0
  63. data/lib/generators/ruby_llm_agents/templates/image_transformer.rb.tt +44 -0
  64. data/lib/generators/ruby_llm_agents/templates/image_upscaler.rb.tt +38 -0
  65. data/lib/generators/ruby_llm_agents/templates/image_variator.rb.tt +33 -0
  66. data/lib/generators/ruby_llm_agents/templates/skills/AGENTS.md.tt +228 -0
  67. data/lib/generators/ruby_llm_agents/templates/skills/BACKGROUND_REMOVERS.md.tt +131 -0
  68. data/lib/generators/ruby_llm_agents/templates/skills/EMBEDDERS.md.tt +255 -0
  69. data/lib/generators/ruby_llm_agents/templates/skills/IMAGE_ANALYZERS.md.tt +120 -0
  70. data/lib/generators/ruby_llm_agents/templates/skills/IMAGE_EDITORS.md.tt +102 -0
  71. data/lib/generators/ruby_llm_agents/templates/skills/IMAGE_GENERATORS.md.tt +282 -0
  72. data/lib/generators/ruby_llm_agents/templates/skills/IMAGE_PIPELINES.md.tt +228 -0
  73. data/lib/generators/ruby_llm_agents/templates/skills/IMAGE_TRANSFORMERS.md.tt +120 -0
  74. data/lib/generators/ruby_llm_agents/templates/skills/IMAGE_UPSCALERS.md.tt +110 -0
  75. data/lib/generators/ruby_llm_agents/templates/skills/IMAGE_VARIATORS.md.tt +120 -0
  76. data/lib/generators/ruby_llm_agents/templates/skills/SPEAKERS.md.tt +212 -0
  77. data/lib/generators/ruby_llm_agents/templates/skills/TOOLS.md.tt +227 -0
  78. data/lib/generators/ruby_llm_agents/templates/skills/TRANSCRIBERS.md.tt +251 -0
  79. data/lib/generators/ruby_llm_agents/templates/skills/WORKFLOWS.md.tt +300 -0
  80. data/lib/generators/ruby_llm_agents/templates/speaker.rb.tt +56 -0
  81. data/lib/generators/ruby_llm_agents/templates/transcriber.rb.tt +51 -0
  82. data/lib/generators/ruby_llm_agents/transcriber_generator.rb +107 -0
  83. data/lib/generators/ruby_llm_agents/upgrade_generator.rb +152 -1
  84. data/lib/ruby_llm/agents/audio/speaker.rb +553 -0
  85. data/lib/ruby_llm/agents/audio/transcriber.rb +669 -0
  86. data/lib/ruby_llm/agents/base_agent.rb +675 -0
  87. data/lib/ruby_llm/agents/core/base/moderation_dsl.rb +181 -0
  88. data/lib/ruby_llm/agents/core/base/moderation_execution.rb +274 -0
  89. data/lib/ruby_llm/agents/core/base.rb +135 -0
  90. data/lib/ruby_llm/agents/core/configuration.rb +981 -0
  91. data/lib/ruby_llm/agents/core/errors.rb +150 -0
  92. data/lib/ruby_llm/agents/{instrumentation.rb → core/instrumentation.rb} +22 -1
  93. data/lib/ruby_llm/agents/core/llm_tenant.rb +358 -0
  94. data/lib/ruby_llm/agents/{version.rb → core/version.rb} +1 -1
  95. data/lib/ruby_llm/agents/dsl/base.rb +110 -0
  96. data/lib/ruby_llm/agents/dsl/caching.rb +142 -0
  97. data/lib/ruby_llm/agents/dsl/reliability.rb +307 -0
  98. data/lib/ruby_llm/agents/dsl.rb +41 -0
  99. data/lib/ruby_llm/agents/image/analyzer/dsl.rb +130 -0
  100. data/lib/ruby_llm/agents/image/analyzer/execution.rb +402 -0
  101. data/lib/ruby_llm/agents/image/analyzer.rb +90 -0
  102. data/lib/ruby_llm/agents/image/background_remover/dsl.rb +154 -0
  103. data/lib/ruby_llm/agents/image/background_remover/execution.rb +240 -0
  104. data/lib/ruby_llm/agents/image/background_remover.rb +89 -0
  105. data/lib/ruby_llm/agents/image/concerns/image_operation_dsl.rb +91 -0
  106. data/lib/ruby_llm/agents/image/concerns/image_operation_execution.rb +165 -0
  107. data/lib/ruby_llm/agents/image/editor/dsl.rb +56 -0
  108. data/lib/ruby_llm/agents/image/editor/execution.rb +207 -0
  109. data/lib/ruby_llm/agents/image/editor.rb +92 -0
  110. data/lib/ruby_llm/agents/image/generator/active_storage_support.rb +127 -0
  111. data/lib/ruby_llm/agents/image/generator/content_policy.rb +95 -0
  112. data/lib/ruby_llm/agents/image/generator/pricing.rb +353 -0
  113. data/lib/ruby_llm/agents/image/generator/templates.rb +124 -0
  114. data/lib/ruby_llm/agents/image/generator.rb +455 -0
  115. data/lib/ruby_llm/agents/image/pipeline/dsl.rb +213 -0
  116. data/lib/ruby_llm/agents/image/pipeline/execution.rb +382 -0
  117. data/lib/ruby_llm/agents/image/pipeline.rb +97 -0
  118. data/lib/ruby_llm/agents/image/transformer/dsl.rb +148 -0
  119. data/lib/ruby_llm/agents/image/transformer/execution.rb +223 -0
  120. data/lib/ruby_llm/agents/image/transformer.rb +95 -0
  121. data/lib/ruby_llm/agents/image/upscaler/dsl.rb +83 -0
  122. data/lib/ruby_llm/agents/image/upscaler/execution.rb +219 -0
  123. data/lib/ruby_llm/agents/image/upscaler.rb +81 -0
  124. data/lib/ruby_llm/agents/image/variator/dsl.rb +62 -0
  125. data/lib/ruby_llm/agents/image/variator/execution.rb +189 -0
  126. data/lib/ruby_llm/agents/image/variator.rb +80 -0
  127. data/lib/ruby_llm/agents/{alert_manager.rb → infrastructure/alert_manager.rb} +17 -22
  128. data/lib/ruby_llm/agents/infrastructure/budget/budget_query.rb +145 -0
  129. data/lib/ruby_llm/agents/infrastructure/budget/config_resolver.rb +149 -0
  130. data/lib/ruby_llm/agents/infrastructure/budget/forecaster.rb +68 -0
  131. data/lib/ruby_llm/agents/infrastructure/budget/spend_recorder.rb +279 -0
  132. data/lib/ruby_llm/agents/infrastructure/budget_tracker.rb +275 -0
  133. data/lib/ruby_llm/agents/{execution_logger_job.rb → infrastructure/execution_logger_job.rb} +17 -1
  134. data/lib/ruby_llm/agents/{reliability → infrastructure/reliability}/executor.rb +2 -1
  135. data/lib/ruby_llm/agents/{reliability → infrastructure/reliability}/retry_strategy.rb +9 -3
  136. data/lib/ruby_llm/agents/{reliability.rb → infrastructure/reliability.rb} +11 -21
  137. data/lib/ruby_llm/agents/pipeline/builder.rb +215 -0
  138. data/lib/ruby_llm/agents/pipeline/context.rb +255 -0
  139. data/lib/ruby_llm/agents/pipeline/executor.rb +86 -0
  140. data/lib/ruby_llm/agents/pipeline/middleware/base.rb +124 -0
  141. data/lib/ruby_llm/agents/pipeline/middleware/budget.rb +95 -0
  142. data/lib/ruby_llm/agents/pipeline/middleware/cache.rb +171 -0
  143. data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb +415 -0
  144. data/lib/ruby_llm/agents/pipeline/middleware/reliability.rb +276 -0
  145. data/lib/ruby_llm/agents/pipeline/middleware/tenant.rb +196 -0
  146. data/lib/ruby_llm/agents/pipeline.rb +68 -0
  147. data/lib/ruby_llm/agents/{engine.rb → rails/engine.rb} +79 -11
  148. data/lib/ruby_llm/agents/results/background_removal_result.rb +286 -0
  149. data/lib/ruby_llm/agents/{result.rb → results/base.rb} +73 -1
  150. data/lib/ruby_llm/agents/results/embedding_result.rb +243 -0
  151. data/lib/ruby_llm/agents/results/image_analysis_result.rb +314 -0
  152. data/lib/ruby_llm/agents/results/image_edit_result.rb +250 -0
  153. data/lib/ruby_llm/agents/results/image_generation_result.rb +346 -0
  154. data/lib/ruby_llm/agents/results/image_pipeline_result.rb +399 -0
  155. data/lib/ruby_llm/agents/results/image_transform_result.rb +251 -0
  156. data/lib/ruby_llm/agents/results/image_upscale_result.rb +255 -0
  157. data/lib/ruby_llm/agents/results/image_variation_result.rb +237 -0
  158. data/lib/ruby_llm/agents/results/moderation_result.rb +158 -0
  159. data/lib/ruby_llm/agents/results/speech_result.rb +338 -0
  160. data/lib/ruby_llm/agents/results/transcription_result.rb +408 -0
  161. data/lib/ruby_llm/agents/text/embedder.rb +444 -0
  162. data/lib/ruby_llm/agents/text/moderator.rb +237 -0
  163. data/lib/ruby_llm/agents/workflow/async.rb +220 -0
  164. data/lib/ruby_llm/agents/workflow/async_executor.rb +156 -0
  165. data/lib/ruby_llm/agents/{workflow.rb → workflow/orchestrator.rb} +6 -5
  166. data/lib/ruby_llm/agents/workflow/parallel.rb +34 -17
  167. data/lib/ruby_llm/agents/workflow/thread_pool.rb +185 -0
  168. data/lib/ruby_llm/agents.rb +86 -20
  169. metadata +172 -34
  170. data/lib/ruby_llm/agents/base/caching.rb +0 -40
  171. data/lib/ruby_llm/agents/base/cost_calculation.rb +0 -105
  172. data/lib/ruby_llm/agents/base/dsl.rb +0 -324
  173. data/lib/ruby_llm/agents/base/execution.rb +0 -366
  174. data/lib/ruby_llm/agents/base/reliability_dsl.rb +0 -82
  175. data/lib/ruby_llm/agents/base/reliability_execution.rb +0 -136
  176. data/lib/ruby_llm/agents/base/response_building.rb +0 -86
  177. data/lib/ruby_llm/agents/base/tool_tracking.rb +0 -57
  178. data/lib/ruby_llm/agents/base.rb +0 -210
  179. data/lib/ruby_llm/agents/budget_tracker.rb +0 -733
  180. data/lib/ruby_llm/agents/configuration.rb +0 -394
  181. /data/lib/ruby_llm/agents/{deprecations.rb → core/deprecations.rb} +0 -0
  182. /data/lib/ruby_llm/agents/{inflections.rb → core/inflections.rb} +0 -0
  183. /data/lib/ruby_llm/agents/{resolved_config.rb → core/resolved_config.rb} +0 -0
  184. /data/lib/ruby_llm/agents/{attempt_tracker.rb → infrastructure/attempt_tracker.rb} +0 -0
  185. /data/lib/ruby_llm/agents/{cache_helper.rb → infrastructure/cache_helper.rb} +0 -0
  186. /data/lib/ruby_llm/agents/{circuit_breaker.rb → infrastructure/circuit_breaker.rb} +0 -0
  187. /data/lib/ruby_llm/agents/{redactor.rb → infrastructure/redactor.rb} +0 -0
  188. /data/lib/ruby_llm/agents/{reliability → infrastructure/reliability}/breaker_manager.rb +0 -0
  189. /data/lib/ruby_llm/agents/{reliability → infrastructure/reliability}/execution_constraints.rb +0 -0
  190. /data/lib/ruby_llm/agents/{reliability → infrastructure/reliability}/fallback_routing.rb +0 -0
@@ -0,0 +1,669 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require_relative "../results/transcription_result"
5
+
6
+ module RubyLLM
7
+ module Agents
8
+ # Base class for creating audio transcribers using the middleware pipeline
9
+ #
10
+ # Transcriber provides a DSL for configuring audio-to-text operations with
11
+ # built-in execution tracking, budget controls, and multi-tenancy support
12
+ # through the middleware pipeline.
13
+ #
14
+ # @example Basic usage
15
+ # class MeetingTranscriber < RubyLLM::Agents::Transcriber
16
+ # model 'whisper-1'
17
+ # end
18
+ #
19
+ # result = MeetingTranscriber.call(audio: "meeting.mp3")
20
+ # result.text # => "Hello everyone, welcome to the meeting..."
21
+ #
22
+ # @example With language specification
23
+ # class SpanishTranscriber < RubyLLM::Agents::Transcriber
24
+ # model 'gpt-4o-transcribe'
25
+ # language 'es'
26
+ #
27
+ # def prompt
28
+ # "Podcast sobre tecnología y programación"
29
+ # end
30
+ # end
31
+ #
32
+ # @example With subtitle output
33
+ # class SubtitleGenerator < RubyLLM::Agents::Transcriber
34
+ # model 'whisper-1'
35
+ # output_format :srt
36
+ # include_timestamps :segment
37
+ # end
38
+ #
39
+ # result = SubtitleGenerator.call(audio: "video.mp4")
40
+ # result.srt # => "1\n00:00:00,000 --> 00:00:02,500\nHello\n\n..."
41
+ #
42
+ # @api public
43
+ class Transcriber < BaseAgent
44
+ class << self
45
# Category marker used to group transcribers with other audio agents.
#
# @return [Symbol] always :audio
def agent_type
  :audio
end
51
+
52
+ # @!group Transcriber-specific DSL
53
+
54
# Reads or assigns the transcription model for this class.
#
# Lookup order: the value set on this class, then the parent class
# (only when the parent is itself an audio agent), then the
# configured default transcription model.
#
# @param value [String, nil] Model identifier to assign (omit to read)
# @return [String] The effective model
def model(value = nil)
  @model = value if value

  own = defined?(@model) ? @model : nil
  return own if own

  parent = superclass
  parent_is_audio = parent.respond_to?(:agent_type) && parent.agent_type == :audio
  parent_is_audio ? parent.model : default_transcription_model
end
68
+
69
# Reads or assigns the transcription language.
#
# @param value [String, nil] ISO 639-1 language code to assign (omit to read)
# @return [String, nil] This class's language, else the inherited one, else nil
def language(value = nil)
  @language = value if value
  return @language if @language

  inherited_or_default(:language, nil)
end
77
+
78
# Reads or assigns the transcription output format.
#
# @param value [Symbol, nil] One of :text, :json, :srt, :vtt, :verbose_json (omit to read)
# @return [Symbol] This class's format, else the inherited one, else :text
def output_format(value = nil)
  @output_format = value if value
  return @output_format if @output_format

  inherited_or_default(:output_format, :text)
end
86
+
87
# Reads or assigns the timestamp granularity.
#
# @param value [Symbol, nil] One of :none, :segment, :word (omit to read)
# @return [Symbol] This class's setting, else the inherited one, else :segment
def include_timestamps(value = nil)
  @include_timestamps = value if value
  return @include_timestamps if @include_timestamps

  inherited_or_default(:include_timestamps, :segment)
end
95
+
96
+ # @!endgroup
97
+
98
+ # @!group Chunking DSL
99
+
100
# Lazily creates this class's chunking configuration and, when a block
# is given, evaluates the block with the configuration as `self`.
#
# @yield Block evaluated in the context of the ChunkingConfig
# @return [ChunkingConfig] The (possibly updated) chunking configuration
def chunking(&block)
  config = (@chunking_config ||= ChunkingConfig.new)
  config.instance_eval(&block) if block
  config
end
109
+
110
# Looks up the chunking configuration without creating one.
#
# @return [ChunkingConfig, nil] This class's config, else the inherited one, else nil
def chunking_config
  return @chunking_config if @chunking_config

  inherited_or_default(:chunking_config, nil)
end
116
+
117
+ # @!endgroup
118
+
119
+ # @!group Reliability DSL
120
+
121
# Lazily creates this class's reliability configuration and, when a
# block is given, evaluates the block with the configuration as `self`.
#
# @yield Block evaluated in the context of the ReliabilityConfig
# @return [ReliabilityConfig] The (possibly updated) reliability configuration
def reliability(&block)
  config = (@reliability_config ||= ReliabilityConfig.new)
  config.instance_eval(&block) if block
  config
end
130
+
131
# Looks up the reliability configuration without creating one.
#
# @return [ReliabilityConfig, nil] This class's config, else the inherited one, else nil
def reliability_config
  return @reliability_config if @reliability_config

  inherited_or_default(:reliability_config, nil)
end
137
+
138
# Reads or assigns the fallback models tried when the primary model fails.
#
# @param models [Array<String>] Model identifiers (nested arrays are flattened);
#   pass nothing to read the current value
# @return [Array<String>] This class's list, else the inherited one, else []
def fallback_models(*models)
  @fallback_models = models.flatten unless models.empty?
  return @fallback_models if @fallback_models

  inherited_or_default(:fallback_models, [])
end
148
+
149
+ # @!endgroup
150
+
151
# Convenience entry point: builds a transcriber and runs it immediately.
#
# @param audio [String, File, IO] Audio file path, URL, File object, or binary data
# @param format [Symbol, nil] Audio format hint when passing binary data
# @param options [Hash] Additional options forwarded to the constructor
# @return [TranscriptionResult] The transcription result
def call(audio:, format: nil, **options)
  transcriber = new(audio: audio, format: format, **options)
  transcriber.call
end
160
+
161
+ private
162
+
163
# Delegates a DSL lookup to the parent class when the parent supports it.
#
# @param method [Symbol] Name of the class-level reader to call on the superclass
# @param default [Object] Value returned when the superclass lacks that reader
# @return [Object] The superclass's value, or +default+
def inherited_or_default(method, default)
  return default unless superclass.respond_to?(method)

  superclass.send(method)
end
166
+
167
# Fetches the globally configured default transcription model.
#
# Any StandardError raised while reading the configuration is swallowed
# and the hard-coded "whisper-1" is returned instead.
#
# @return [String] The default model identifier
def default_transcription_model
  begin
    RubyLLM::Agents.configuration.default_transcription_model
  rescue StandardError
    "whisper-1"
  end
end
172
+ end
173
+
174
# Settings holder for splitting long audio into chunks.
#
# Defaults: disabled, 600-second chunks, 5-second overlap, sequential.
class ChunkingConfig
  FIELDS = %i[enabled max_duration overlap parallel].freeze

  attr_accessor(*FIELDS)

  def initialize
    @enabled = false
    @parallel = false
    @max_duration = 600 # seconds (10 minutes)
    @overlap = 5 # seconds
  end

  # @return [Boolean] the current value of +enabled+
  def enabled?
    @enabled
  end

  # @return [Hash{Symbol => Object}] all settings keyed by name
  def to_h
    FIELDS.to_h { |field| [field, public_send(field)] }
  end
end
198
+
199
# Settings holder for retry, fallback and timeout behaviour.
#
# Defaults: 3 retries, exponential backoff, no fallbacks, no total timeout.
class ReliabilityConfig
  attr_accessor :max_retries, :backoff, :fallback_models_list, :total_timeout_seconds

  def initialize
    self.max_retries = 3
    self.backoff = :exponential
    self.fallback_models_list = []
    self.total_timeout_seconds = nil
  end

  # DSL setter for retry count and backoff strategy.
  #
  # @param max [Integer] retry limit
  # @param backoff [Symbol] backoff strategy name
  # @return [Symbol] the backoff strategy just stored
  def retries(max: 3, backoff: :exponential)
    self.max_retries = max
    self.backoff = backoff
  end

  # DSL setter for fallback models (nested arrays are flattened).
  #
  # @param models [Array<String>] model identifiers
  # @return [Array<String>] the stored list
  def fallback_models(*models)
    self.fallback_models_list = models.flatten
  end

  # DSL setter for the overall timeout.
  #
  # @param seconds [Numeric] timeout in seconds
  # @return [Numeric] the stored value
  def total_timeout(seconds)
    self.total_timeout_seconds = seconds
  end

  # @return [Hash{Symbol => Object}] all settings keyed by public name
  def to_h
    {
      max_retries: @max_retries,
      backoff: @backoff,
      fallback_models: @fallback_models_list,
      total_timeout: @total_timeout_seconds
    }
  end
end
232
+
233
+ # @!attribute [r] audio
234
+ # @return [String, File, IO] Audio input
235
+ # @!attribute [r] audio_format
236
+ # @return [Symbol, nil] Audio format hint
237
+ attr_reader :audio, :audio_format
238
+
239
# Builds a transcriber for one audio input.
#
# The :language option, if present, is removed from +options+ and kept
# as a per-call language override; the remaining options (with :model
# defaulted to the class-level model) are passed to the superclass.
#
# @param audio [String, File, IO] Audio file path, URL, File object, or binary data
# @param format [Symbol, nil] Audio format hint when passing binary data
# @param options [Hash] Configuration options
def initialize(audio:, format: nil, **options)
  @audio = audio
  @audio_format = format
  @runtime_language = options.delete(:language)

  options[:model] = self.class.model unless options[:model]

  super(**options)
end
254
+
255
# Runs this transcription through the middleware pipeline.
#
# @return [TranscriptionResult] The output stored on the resulting context
def call
  Pipeline::Executor.execute(build_context).output
end
263
+
264
# Human-readable description of the audio input for this operation.
#
# A string counts as a URL only when it starts with "http://" or
# "https://", the same rule the cache key and input normalization use;
# a bare "http" prefix would mislabel local paths such as
# "httpd_logs/call.mp3".
#
# @return [String] "Audio URL: ...", "Audio file: ..." or "Audio data"
def user_prompt
  return "Audio data" unless @audio.is_a?(String)

  if @audio.start_with?("http://", "https://")
    "Audio URL: #{@audio}"
  else
    "Audio file: #{@audio}"
  end
end
275
+
276
# Optional context hint passed along with the transcription request.
#
# The base implementation supplies no hint. Subclasses may override
# this to return text intended to improve transcription accuracy.
#
# @return [String, nil] The context prompt, or nil for none
def prompt
  nil
end
285
+
286
# Hook applied to the transcribed text before the result is built.
#
# The base implementation is the identity; subclasses may override it
# to apply custom clean-up.
#
# @param text [String] The transcribed text
# @return [String] The processed text
def postprocess_text(text)
  text
end
295
+
296
# Core transcription step, invoked after middleware has been applied.
#
# Normalizes and validates the audio, performs the transcription with
# retries/fallbacks, then records cost and the final result on the context.
#
# @param context [Pipeline::Context] The execution context
# @return [void] Sets context.output with the TranscriptionResult
def execute(context)
  began_at = Time.current

  normalized = normalize_audio_input(@audio, @audio_format)
  validate_audio_input!(normalized)
  raw = execute_with_reliability(normalized)

  finished_at = Time.current
  elapsed_ms = ((finished_at - began_at) * 1000).to_i

  # Audio is billed by duration, so token counts are always zero.
  context.input_tokens = 0
  context.output_tokens = 0
  context.total_cost = calculate_cost(raw)

  timing = {
    started_at: context.started_at || began_at,
    completed_at: finished_at,
    duration_ms: elapsed_ms
  }
  context.output = build_result(raw, **timing, tenant_id: context.tenant_id)
end
330
+
331
# Generates the cache key for this transcription.
#
# The key combines the agent class, its version, the resolved model and
# language, the output format, and a SHA-256 digest of the audio:
# - URL strings are hashed as text;
# - strings naming an existing regular file are hashed by file content;
# - any other string (e.g. raw binary audio) is hashed directly;
# - File/IO objects are read in full and rewound before and after;
# - anything else is hashed via #to_s.
#
# Strings containing a NUL byte are never probed on disk because
# File.file? raises ArgumentError for them, and directories are not
# treated as files because Digest::SHA256.file cannot read them.
#
# @return [String] Cache key, components joined by "/"
def agent_cache_key
  content_hash =
    case @audio
    when String
      local_file = !@audio.start_with?("http://", "https://") &&
                   !@audio.include?("\0") &&
                   File.file?(@audio)
      local_file ? Digest::SHA256.file(@audio).hexdigest : Digest::SHA256.hexdigest(@audio)
    when File, IO
      @audio.rewind if @audio.respond_to?(:rewind)
      Digest::SHA256.hexdigest(@audio.read).tap do
        @audio.rewind if @audio.respond_to?(:rewind)
      end
    else
      Digest::SHA256.hexdigest(@audio.to_s)
    end

  [
    "ruby_llm_agents",
    "transcription",
    self.class.name,
    self.class.version,
    resolved_model,
    resolved_language,
    self.class.output_format,
    content_hash
  ].compact.join("/")
end
367
+
368
+ private
369
+
370
# Assembles the pipeline context describing this transcription run.
#
# @return [Pipeline::Context] The context object
def build_context
  attributes = {
    input: user_prompt,
    agent_class: self.class,
    agent_instance: self,
    model: resolved_model,
    tenant: @options[:tenant],
    skip_cache: @options[:skip_cache]
  }

  Pipeline::Context.new(**attributes)
end
383
+
384
# Classifies the audio argument and wraps it in a descriptor hash.
#
# @param audio [String, File, IO] Audio input
# @param format [Symbol, nil] Format hint (only recorded for binary data)
# @return [Hash] Descriptor with :source and :type (:url, :file_path,
#   :binary or :file_object); binary descriptors also carry :format
# @raise [ArgumentError] If +audio+ is not a String, File or IO
def normalize_audio_input(audio, format)
  return { source: audio, type: :file_object } if audio.is_a?(File) || audio.is_a?(IO)

  unless audio.is_a?(String)
    raise ArgumentError, "audio must be a file path, URL, File object, or binary data"
  end

  return { source: audio, type: :url } if audio.start_with?("http://", "https://")
  return { source: audio, type: :file_path } if looks_like_file_path?(audio)

  # Anything else is assumed to be raw audio bytes.
  { source: audio, type: :binary, format: format }
end
406
+
407
# Determines if a string looks like a file path rather than raw audio bytes.
#
# Strings containing a NUL byte or an invalid byte sequence are treated
# as binary data: a path cannot contain NUL (File.exist? raises
# ArgumentError for one), and real audio payloads almost always contain
# both NUL and "/" bytes, which would otherwise be mistaken for a path.
#
# @param str [String] String to check
# @return [Boolean] True if it looks like a file path
def looks_like_file_path?(str)
  return false if str.include?("\0") || !str.valid_encoding?

  # Path separators or a common audio extension are enough on their own.
  return true if str.include?("/") || str.include?("\\")
  # \z (not $) so the extension must end the whole string, not just a line.
  return true if str.match?(/\.(mp3|wav|ogg|flac|m4a|aac|webm|mp4|mpeg)\z/i)

  # Otherwise fall back to checking the filesystem.
  File.exist?(str)
end
419
+
420
# Checks a normalized audio descriptor and raises when it is unusable.
#
# File paths must exist, URLs must start with http:// or https://, and
# binary data must be non-empty. Other types are not checked.
#
# @param audio_input [Hash] Normalized audio input (:type and :source)
# @return [nil]
# @raise [ArgumentError] If input is invalid
def validate_audio_input!(audio_input)
  source = audio_input[:source]

  failure =
    case audio_input[:type]
    when :file_path
      "Audio file not found: #{source}" unless File.exist?(source)
    when :url
      "Invalid audio URL: #{source}" unless source.match?(%r{\Ahttps?://}i)
    when :binary
      "Binary audio data cannot be empty" if source.nil? || source.empty?
    end

  raise ArgumentError, failure if failure
end
440
+
441
# Executes transcription with retries and model fallbacks.
#
# Models are tried in order: the resolved primary model, the class-level
# `fallback_models`, then any models declared inside the `reliability`
# block (`reliability { fallback_models ... }`). Duplicates are dropped
# so the same model is not tried twice.
#
# For each model, a retryable error is retried up to `max_retries` times
# after the initial attempt, sleeping for the computed backoff between
# attempts. Any other error, or exhausting the retries, moves on to the
# next model. If every model fails, the last error is re-raised.
#
# @param audio_input [Hash] Normalized audio input
# @return [Hash] Raw transcription result
# @raise [StandardError] The last error seen once all models are exhausted
def execute_with_reliability(audio_input)
  block_fallbacks = self.class.reliability_config&.fallback_models_list
  candidates = [resolved_model, *self.class.fallback_models, *Array(block_fallbacks)].compact.uniq
  max_retries = reliability_max_retries
  last_error = nil

  candidates.each do |candidate|
    failures = 0

    begin
      return execute_transcription(audio_input, candidate)
    rescue StandardError => e
      last_error = e
      failures += 1

      # `failures` counts attempts that have failed so far, so allowing
      # `failures <= max_retries` yields exactly max_retries retries.
      if retryable_error?(e) && failures <= max_retries
        sleep(calculate_backoff(failures))
        retry
      end
      # Falling out of the rescue moves on to the next candidate model.
    end
  end

  raise last_error || StandardError.new("All transcription models exhausted")
end
471
+
472
# Performs one transcription request against a specific model.
#
# @param audio_input [Hash] Normalized audio input
# @param model [String] Model to use
# @return [Hash] Raw result with :text, :segments, :words, :language,
#   :duration, :model and :raw_response
def execute_transcription(audio_input, model)
  request_options = build_transcribe_options(model)
  source = resolve_audio_source(audio_input)

  reply = RubyLLM.transcribe(source, **request_options)

  # Language and duration are read only if the response exposes them.
  optional = ->(attribute) { reply.respond_to?(attribute) ? reply.public_send(attribute) : nil }

  {
    text: reply.text,
    segments: extract_segments(reply),
    words: extract_words(reply),
    language: optional.call(:language),
    duration: optional.call(:duration),
    model: model,
    raw_response: reply
  }
end
496
+
497
# Assembles the keyword options for the transcription request.
#
# :language and :prompt are included only when present. :response_format
# is set for the :verbose_json, :srt and :vtt output formats, and
# :timestamp_granularities for the :word and :segment timestamp levels.
#
# @param model [String] Model to use
# @return [Hash] Options for transcription
def build_transcribe_options(model)
  request = { model: model }

  lang = resolved_language
  request[:language] = lang if lang

  hint = prompt
  request[:prompt] = hint if hint

  response_format = {
    verbose_json: "verbose_json",
    srt: "srt",
    vtt: "vtt"
  }[self.class.output_format]
  request[:response_format] = response_format if response_format

  granularities = {
    word: ["word", "segment"],
    segment: ["segment"]
  }[self.class.include_timestamps]
  request[:timestamp_granularities] = granularities if granularities

  request
end
532
+
533
# Picks the value handed to the transcription API.
#
# Every recognized input type passes its :source through unchanged;
# an unrecognized type yields nil.
#
# @param audio_input [Hash] Normalized audio input
# @return [String, File, nil] Audio source for API
def resolve_audio_source(audio_input)
  recognized = %i[file_path file_object url binary]
  audio_input[:source] if recognized.include?(audio_input[:type])
end
543
+
544
# Converts the response's segments into plain hashes.
#
# Each segment is read by symbol key first, then by string key.
#
# @param response [Object] Transcription response
# @return [Array<Hash>, nil] Hashes with :start, :end, :text, :speaker;
#   nil when the response has no segments array
def extract_segments(response)
  return nil unless response.respond_to?(:segments)

  raw = response.segments
  return nil unless raw.is_a?(Array)

  raw.map do |entry|
    %i[start end text speaker].to_h do |field|
      [field, entry[field] || entry[field.to_s]]
    end
  end
end
563
+
564
# Converts the response's word timings into plain hashes.
#
# Each word entry is read by symbol key first, then by string key.
#
# @param response [Object] Transcription response
# @return [Array<Hash>, nil] Hashes with :start, :end, :word;
#   nil when the response has no words array
def extract_words(response)
  return nil unless response.respond_to?(:words)

  raw = response.words
  return nil unless raw.is_a?(Array)

  raw.map do |entry|
    %i[start end word].to_h do |field|
      [field, entry[field] || entry[field.to_s]]
    end
  end
end
582
+
583
# Wraps the raw transcription hash in a TranscriptionResult.
#
# The text is passed through #postprocess_text when present, and the
# audio length is additionally reported in minutes when known.
#
# @param raw_result [Hash] Raw transcription result
# @param started_at [Time] When execution started
# @param completed_at [Time] When execution finished
# @param duration_ms [Integer] Elapsed wall time in milliseconds
# @param tenant_id [Object] Tenant identifier from the context
# @return [TranscriptionResult] Result with status :success
def build_result(raw_result, started_at:, completed_at:, duration_ms:, tenant_id:)
  raw_text = raw_result[:text]
  final_text = raw_text ? postprocess_text(raw_text) : nil

  seconds = raw_result[:duration]
  minutes = seconds ? seconds / 60.0 : nil

  TranscriptionResult.new(
    text: final_text,
    segments: raw_result[:segments],
    words: raw_result[:words],
    language: resolved_language,
    detected_language: raw_result[:language],
    audio_duration: seconds,
    model_id: raw_result[:model],
    duration_ms: duration_ms,
    started_at: started_at,
    completed_at: completed_at,
    total_cost: calculate_cost(raw_result),
    audio_minutes: minutes,
    status: :success,
    tenant_id: tenant_id
  )
end
605
+
606
# Determines the cost of a transcription.
#
# A cost reported by the raw response wins; otherwise the cost is
# estimated as audio minutes times a per-minute rate chosen by model
# name (unknown models use the whisper-1 rate).
#
# @param raw_result [Hash] Raw transcription result
# @return [Float] Cost in USD
def calculate_cost(raw_result)
  seconds = raw_result[:duration]
  minutes = seconds ? seconds / 60.0 : 0

  response = raw_result[:raw_response]
  reported = response.cost if response.respond_to?(:cost)
  return reported if reported

  model_name = raw_result[:model].to_s
  rate =
    if model_name.match?(/whisper-1/)
      0.006
    elsif model_name.match?(/gpt-4o-transcribe/)
      0.01
    elsif model_name.match?(/gpt-4o-mini-transcribe/)
      0.005
    else
      0.006 # unknown models fall back to whisper pricing
    end

  minutes * rate
end
634
+
635
# Effective model: the instance-level @model when set, otherwise the
# class-level model.
#
# @return [String] Model identifier
def resolved_model
  return @model if @model

  self.class.model
end
639
+
640
# Effective language: the per-call override when given, otherwise the
# class-level language.
#
# @return [String, nil] Language code, or nil when none is configured
def resolved_language
  return @runtime_language if @runtime_language

  self.class.language
end
644
+
645
# Retry limit taken from the class's reliability configuration.
#
# @return [Integer] Configured max_retries, or 3 when nothing is configured
def reliability_max_retries
  configured = self.class.reliability_config&.max_retries
  configured || 3
end
650
+
651
# Decides whether an error is transient, based on its message.
#
# The lower-cased message is searched for rate-limit, timeout,
# overload and 429/502/503 markers.
#
# @param error [Exception] The error raised by a transcription attempt
# @return [Boolean] True when the message contains a retryable marker
def retryable_error?(error)
  text = error.message.to_s.downcase
  markers = ["rate limit", "timeout", "503", "502", "429", "overloaded"]
  markers.any?(&text.method(:include?))
end
657
+
658
# Calculates the delay to sleep before the next retry.
#
# With `backoff :constant` the delay is a fixed 1.0 second for every
# attempt. With any other setting (including no reliability config) the
# delay is exponential: 0.4s, 0.8s, 1.6s, ... doubling per attempt.
# Either way the delay is capped at 10 seconds.
#
# @param attempt [Integer] 1-based number of the attempt that just failed
# @return [Float] Delay in seconds
def calculate_backoff(attempt)
  max_delay = 10.0
  constant = self.class.reliability_config&.backoff == :constant

  # A constant strategy must not grow with the attempt number.
  delay = constant ? 1.0 : 0.4 * (2**(attempt - 1))
  [delay, max_delay].min
end
667
+ end
668
+ end
669
+ end