riffer 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133) hide show
  1. checksums.yaml +4 -4
  2. data/.agents/code-style.md +2 -1
  3. data/.agents/rbs-inline.md +123 -0
  4. data/.agents/rdoc.md +14 -8
  5. data/.release-please-manifest.json +1 -1
  6. data/AGENTS.md +3 -0
  7. data/CHANGELOG.md +24 -0
  8. data/CLAUDE.md +1 -0
  9. data/Guardfile +7 -0
  10. data/Rakefile +23 -1
  11. data/Steepfile +14 -0
  12. data/docs/03_AGENTS.md +24 -1
  13. data/docs/06_STREAM_EVENTS.md +34 -0
  14. data/docs/08_EVALS.md +316 -0
  15. data/docs/09_GUARDRAILS.md +386 -0
  16. data/lib/riffer/agent/response.rb +51 -0
  17. data/lib/riffer/agent.rb +205 -116
  18. data/lib/riffer/config.rb +19 -13
  19. data/lib/riffer/core.rb +5 -6
  20. data/lib/riffer/evals/evaluator.rb +107 -0
  21. data/lib/riffer/evals/evaluators/answer_relevancy.rb +60 -0
  22. data/lib/riffer/evals/evaluators.rb +59 -0
  23. data/lib/riffer/evals/judge.rb +110 -0
  24. data/lib/riffer/evals/metric.rb +66 -0
  25. data/lib/riffer/evals/profile.rb +100 -0
  26. data/lib/riffer/evals/result.rb +70 -0
  27. data/lib/riffer/evals/run_result.rb +99 -0
  28. data/lib/riffer/evals/runner.rb +47 -0
  29. data/lib/riffer/evals.rb +11 -0
  30. data/lib/riffer/guardrail.rb +99 -0
  31. data/lib/riffer/guardrails/max_length.rb +69 -0
  32. data/lib/riffer/guardrails/modification.rb +38 -0
  33. data/lib/riffer/guardrails/result.rb +94 -0
  34. data/lib/riffer/guardrails/runner.rb +104 -0
  35. data/lib/riffer/guardrails/tripwire.rb +60 -0
  36. data/lib/riffer/guardrails.rb +10 -0
  37. data/lib/riffer/helpers/class_name_converter.rb +3 -5
  38. data/lib/riffer/helpers/dependencies.rb +3 -5
  39. data/lib/riffer/helpers/validations.rb +3 -5
  40. data/lib/riffer/helpers.rb +11 -0
  41. data/lib/riffer/messages/assistant.rb +9 -16
  42. data/lib/riffer/messages/base.rb +6 -7
  43. data/lib/riffer/messages/converter.rb +4 -4
  44. data/lib/riffer/messages/system.rb +2 -1
  45. data/lib/riffer/messages/tool.rb +9 -22
  46. data/lib/riffer/messages/user.rb +2 -1
  47. data/lib/riffer/messages.rb +1 -0
  48. data/lib/riffer/providers/amazon_bedrock.rb +16 -8
  49. data/lib/riffer/providers/anthropic.rb +15 -7
  50. data/lib/riffer/providers/base.rb +8 -14
  51. data/lib/riffer/providers/open_ai.rb +19 -9
  52. data/lib/riffer/providers/repository.rb +9 -10
  53. data/lib/riffer/providers/test.rb +17 -22
  54. data/lib/riffer/providers.rb +1 -0
  55. data/lib/riffer/stream_events/base.rb +5 -6
  56. data/lib/riffer/stream_events/guardrail_modification.rb +43 -0
  57. data/lib/riffer/stream_events/guardrail_tripwire.rb +52 -0
  58. data/lib/riffer/stream_events/reasoning_delta.rb +4 -10
  59. data/lib/riffer/stream_events/reasoning_done.rb +4 -10
  60. data/lib/riffer/stream_events/text_delta.rb +4 -10
  61. data/lib/riffer/stream_events/text_done.rb +4 -10
  62. data/lib/riffer/stream_events/token_usage_done.rb +4 -10
  63. data/lib/riffer/stream_events/tool_call_delta.rb +6 -12
  64. data/lib/riffer/stream_events/tool_call_done.rb +7 -14
  65. data/lib/riffer/stream_events.rb +1 -0
  66. data/lib/riffer/token_usage.rb +10 -23
  67. data/lib/riffer/tool.rb +58 -76
  68. data/lib/riffer/tools/param.rb +12 -15
  69. data/lib/riffer/tools/params.rb +8 -19
  70. data/lib/riffer/tools/response.rb +14 -20
  71. data/lib/riffer/tools.rb +1 -0
  72. data/lib/riffer/version.rb +2 -1
  73. data/lib/riffer.rb +22 -26
  74. data/sig/generated/riffer/agent/response.rbs +42 -0
  75. data/sig/generated/riffer/agent.rbs +175 -0
  76. data/sig/generated/riffer/config.rbs +60 -0
  77. data/sig/generated/riffer/core.rbs +17 -0
  78. data/sig/generated/riffer/evals/evaluator.rbs +66 -0
  79. data/sig/generated/riffer/evals/evaluators/answer_relevancy.rbs +24 -0
  80. data/sig/generated/riffer/evals/evaluators.rbs +42 -0
  81. data/sig/generated/riffer/evals/judge.rbs +52 -0
  82. data/sig/generated/riffer/evals/metric.rbs +46 -0
  83. data/sig/generated/riffer/evals/profile.rbs +60 -0
  84. data/sig/generated/riffer/evals/result.rbs +49 -0
  85. data/sig/generated/riffer/evals/run_result.rbs +62 -0
  86. data/sig/generated/riffer/evals/runner.rbs +27 -0
  87. data/sig/generated/riffer/evals.rbs +10 -0
  88. data/sig/generated/riffer/guardrail.rbs +79 -0
  89. data/sig/generated/riffer/guardrails/max_length.rbs +38 -0
  90. data/sig/generated/riffer/guardrails/modification.rbs +31 -0
  91. data/sig/generated/riffer/guardrails/result.rbs +73 -0
  92. data/sig/generated/riffer/guardrails/runner.rbs +51 -0
  93. data/sig/generated/riffer/guardrails/tripwire.rbs +45 -0
  94. data/sig/generated/riffer/guardrails.rbs +9 -0
  95. data/sig/generated/riffer/helpers/class_name_converter.rbs +11 -0
  96. data/sig/generated/riffer/helpers/dependencies.rbs +25 -0
  97. data/sig/generated/riffer/helpers/validations.rbs +11 -0
  98. data/sig/generated/riffer/helpers.rbs +10 -0
  99. data/sig/generated/riffer/messages/assistant.rbs +41 -0
  100. data/sig/generated/riffer/messages/base.rbs +24 -0
  101. data/sig/generated/riffer/messages/converter.rbs +18 -0
  102. data/sig/generated/riffer/messages/system.rbs +11 -0
  103. data/sig/generated/riffer/messages/tool.rbs +41 -0
  104. data/sig/generated/riffer/messages/user.rbs +11 -0
  105. data/sig/generated/riffer/messages.rbs +11 -0
  106. data/sig/generated/riffer/providers/amazon_bedrock.rbs +42 -0
  107. data/sig/generated/riffer/providers/anthropic.rbs +39 -0
  108. data/sig/generated/riffer/providers/base.rbs +37 -0
  109. data/sig/generated/riffer/providers/open_ai.rbs +46 -0
  110. data/sig/generated/riffer/providers/repository.rbs +12 -0
  111. data/sig/generated/riffer/providers/test.rbs +41 -0
  112. data/sig/generated/riffer/providers.rbs +10 -0
  113. data/sig/generated/riffer/stream_events/base.rbs +19 -0
  114. data/sig/generated/riffer/stream_events/guardrail_modification.rbs +37 -0
  115. data/sig/generated/riffer/stream_events/guardrail_tripwire.rbs +37 -0
  116. data/sig/generated/riffer/stream_events/reasoning_delta.rbs +16 -0
  117. data/sig/generated/riffer/stream_events/reasoning_done.rbs +16 -0
  118. data/sig/generated/riffer/stream_events/text_delta.rbs +15 -0
  119. data/sig/generated/riffer/stream_events/text_done.rbs +15 -0
  120. data/sig/generated/riffer/stream_events/token_usage_done.rbs +19 -0
  121. data/sig/generated/riffer/stream_events/tool_call_delta.rbs +21 -0
  122. data/sig/generated/riffer/stream_events/tool_call_done.rbs +24 -0
  123. data/sig/generated/riffer/stream_events.rbs +13 -0
  124. data/sig/generated/riffer/token_usage.rbs +48 -0
  125. data/sig/generated/riffer/tool.rbs +92 -0
  126. data/sig/generated/riffer/tools/param.rbs +39 -0
  127. data/sig/generated/riffer/tools/params.rbs +38 -0
  128. data/sig/generated/riffer/tools/response.rbs +62 -0
  129. data/sig/generated/riffer/tools.rbs +10 -0
  130. data/sig/generated/riffer/version.rbs +5 -0
  131. data/sig/generated/riffer.rbs +36 -0
  132. metadata +118 -6
  133. data/sig/riffer.rbs +0 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f5f2d3838897b17320d17d3cc95b1db37ae8af023d6dcac8c5c9ce0fa5a6e847
4
- data.tar.gz: e7e75acdd198f147747d06bcaaddeba27f3491bcc12edd2ac0e35f464d47e2f0
3
+ metadata.gz: 0cf5738c3a643db6e1626862456336f6b4bba78bbb1e0efa599ebf61e0b80ee5
4
+ data.tar.gz: 683586055701aa2ffb62dd88559ceac640b6e0221120dee6fa589ef3e44a6cb4
5
5
  SHA512:
6
- metadata.gz: 557d945daf22b6a45cb838532a5df0c811d2ac73377b2a5c445efb0dc06a8a2e9c234d14691333d699c6d0dddda7299a5e3bb5a676e4ae72d32b1f631c9694b4
7
- data.tar.gz: aa35cbff64ef8c11e72e76c3c931589e8163f751b96c066cfef74a5f7779fabb8891565c4d215b2ea9b29fca45fb991ebd49260462c203aa9b73328f39d913ca
6
+ metadata.gz: 2c3fed6e20f275474ac0e9388a9b5200eab9b899c32534c7ba67b2df5e804097774c35821045e1dfbdaf65db426f7d56a827cd86de2eb32cbd8b65cb7aa8d343
7
+ data.tar.gz: 3db7dd0aa5fde3a20f97bd462b634ef01d8221db5f9c1d6c71a6430d915dc17b2ddebdd625d38514731ba63d6803cab6e94f5f61205e69cbf8d8d47237a3ee42
@@ -8,10 +8,11 @@
8
8
 
9
9
  ## Required Header
10
10
 
11
- All Ruby files must include:
11
+ All Ruby files in `lib/` must include:
12
12
 
13
13
  ```ruby
14
14
  # frozen_string_literal: true
15
+ # rbs_inline: enabled
15
16
  ```
16
17
 
17
18
  ## Error Handling
@@ -0,0 +1,123 @@
1
+ # RBS Inline
2
+
3
+ Type annotations are added directly in Ruby source files using [rbs-inline](https://github.com/soutaro/rbs-inline).
4
+
5
+ ## Magic Comment
6
+
7
+ Every `lib/**/*.rb` file must include the `rbs_inline: enabled` comment on line 2:
8
+
9
+ ```ruby
10
+ # frozen_string_literal: true
11
+ # rbs_inline: enabled
12
+ ```
13
+
14
+ ## Annotation Syntax
15
+
16
+ The **`#:`** prefix is used — standalone lines above methods (type signatures) or inline on the same line (attributes, constants).
17
+
18
+ ### Method Parameters and Return Types
19
+
20
+ Use a single `#:` line above the method with the RBS method signature:
21
+
22
+ ```ruby
23
+ #: (String, Integer) -> bool
24
+ def valid?(name, age)
25
+ ```
26
+
27
+ #### Parameter Mapping
28
+
29
+ | Ruby param | RBS signature |
30
+ | ---------------------------- | ------------------------ |
31
+ | `def foo(x)` | `(Type)` |
32
+ | `def foo(x = nil)` | `(?Type?)` |
33
+ | `def foo(x = val)` | `(?Type)` |
34
+ | `def foo(x:)` | `(x: Type)` |
35
+ | `def foo(x: nil)` | `(?x: Type?)` |
36
+ | `def foo(x: val)` | `(?x: Type)` |
37
+ | `def foo(*args)` | `(*untyped)` |
38
+ | `def foo(**kwargs)` | `(**untyped)` |
39
+ | `def foo(&block)` (required) | `() { (Type) -> void }` |
40
+ | `def foo(&block)` (optional) | `() ?{ (Type) -> void }` |
41
+ | `def foo(...)` | `(*untyped, **untyped)` |
42
+
43
+ #### Examples
44
+
45
+ ```ruby
46
+ # No parameters
47
+ #: () -> String
48
+ def name
49
+
50
+ # Positional parameters
51
+ #: (String, Integer) -> bool
52
+ def valid?(name, age)
53
+
54
+ # Optional positional parameter
55
+ #: (?String?) -> String
56
+ def self.identifier(value = nil)
57
+
58
+ # Required keyword parameters
59
+ #: (input: String, output: String) -> Riffer::Evals::Result
60
+ def evaluate(input:, output:)
61
+
62
+ # Mixed keyword parameters (required + optional)
63
+ #: (input: String, output: String, ?context: Hash[Symbol, untyped]?) -> Riffer::Evals::Result
64
+ def evaluate(input:, output:, context: nil)
65
+
66
+ # Positional + keyword parameters
67
+ #: (String, ?tool_context: Hash[Symbol, untyped]?) -> String
68
+ def generate(prompt, tool_context: nil)
69
+
70
+ # Splat/double-splat
71
+ #: (**untyped) -> void
72
+ def initialize(**options)
73
+
74
+ # Forward arguments
75
+ #: (*untyped, **untyped) -> String
76
+ def self.generate(...)
77
+
78
+ # Block parameter (required)
79
+ #: () { (Riffer::Messages::Base) -> void } -> self
80
+ def on_message(&block)
81
+
82
+ # Block parameter (optional)
83
+ #: () ?{ (Riffer::Config) -> void } -> void
84
+ def configure(&block)
85
+ ```
86
+
87
+ ### Attributes
88
+
89
+ ```ruby
90
+ attr_reader :name #: String
91
+ attr_reader :items #: Array[String]
92
+ ```
93
+
94
+ ### Constants
95
+
96
+ ```ruby
97
+ VERSION = "1.0.0" #: String
98
+ DEFAULTS = {}.freeze #: Hash[Symbol, untyped]
99
+ ```
100
+
101
+ ## Common Type Patterns
102
+
103
+ | Pattern | Meaning |
104
+ | ------------------------- | --------------------------- |
105
+ | `String?` | Optional (String or nil) |
106
+ | `(String \| Integer)` | Union type |
107
+ | `Array[String]` | Typed array |
108
+ | `Hash[Symbol, untyped]` | Typed hash |
109
+ | `^(String) -> void` | Block/proc type |
110
+ | `singleton(Riffer::Tool)` | Class object (not instance) |
111
+ | `bool` | Boolean (true or false) |
112
+ | `untyped` | Any type |
113
+ | `void` | No meaningful return |
114
+
115
+ ## Workflow
116
+
117
+ After changing type annotations:
118
+
119
+ 1. Run `bundle exec rake rbs:generate` to regenerate `sig/generated/` files
120
+ 2. Commit both the source changes and the generated `.rbs` files
121
+ 3. CI checks for drift between source annotations and committed `.rbs` files
122
+
123
+ Use `bundle exec rake rbs:watch` during development to auto-regenerate on file changes.
data/.agents/rdoc.md CHANGED
@@ -1,24 +1,30 @@
1
1
  # RDoc Documentation
2
2
 
3
- Use pure RDoc comments for public APIs (not YARD).
3
+ Use RDoc prose comments for public API descriptions and RBS inline annotations for types.
4
4
 
5
- ## Parameters
5
+ ## Parameters and Return Types
6
6
 
7
- Use definition list syntax (`::`):
7
+ Describe parameters in the RDoc prose comment. Use a single `#:` line for the RBS method signature (see [rbs-inline.md](rbs-inline.md) for the full type annotation syntax):
8
8
 
9
9
  ```ruby
10
10
  # Creates a new agent.
11
11
  #
12
- # name:: String - the agent name
13
- # options:: Hash - optional configuration
12
+ # +name+ - the agent name.
13
+ # +options+ - optional configuration.
14
+ #
15
+ #: (String, ?options: Hash[Symbol, untyped]) -> void
16
+ def initialize(name, options: {})
14
17
  ```
15
18
 
16
- ## Return Values
19
+ ## Attributes and Constants
17
20
 
18
- Document with prose:
21
+ Use `#:` inline syntax (on the same line) for attribute and constant types:
19
22
 
20
23
  ```ruby
21
- # Returns String - the agent identifier.
24
+ # The agent name.
25
+ attr_reader :name #: String
26
+
27
+ DEFAULT_TIMEOUT = 10 #: Integer
22
28
  ```
23
29
 
24
30
  ## Exceptions
@@ -1,3 +1,3 @@
1
1
  {
2
- ".": "0.11.0"
2
+ ".": "0.12.0"
3
3
  }
data/AGENTS.md CHANGED
@@ -16,6 +16,7 @@ Ruby gem framework for building AI-powered agents with LLM provider adapters.
16
16
  - [Code Style](.agents/code-style.md) - StandardRB and comment conventions
17
17
  - [RDoc](.agents/rdoc.md) - Documentation format for public APIs
18
18
  - [Providers](.agents/providers.md) - Adding new LLM provider adapters
19
+ - [RBS Inline](.agents/rbs-inline.md) - Type annotations with rbs-inline
19
20
 
20
21
  ## Commands
21
22
 
@@ -25,4 +26,6 @@ Ruby gem framework for building AI-powered agents with LLM provider adapters.
25
26
  | `bundle exec rake test` | Run tests only |
26
27
  | `bundle exec rake standard` | Check code style |
27
28
  | `bundle exec rake standard:fix` | Auto-fix style issues |
29
+ | `bundle exec rake rbs:generate` | Generate RBS type signatures |
30
+ | `bundle exec rake rbs:watch` | Watch and regenerate RBS files |
28
31
  | `bin/console` | Interactive console |
data/CHANGELOG.md CHANGED
@@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.12.0](https://github.com/janeapp/riffer/compare/riffer/v0.11.0...riffer/v0.12.0) (2026-02-11)
9
+
10
+
11
+ ### ⚠ BREAKING CHANGES
12
+
13
+ * Agent#generate now returns Riffer::Agent::Response instead of String. Use response.content or response.to_s for the text.
14
+
15
+ ### Features
16
+
17
+ * add Claude Code Review GitHub Action ([#108](https://github.com/janeapp/riffer/issues/108)) ([f4b281c](https://github.com/janeapp/riffer/commit/f4b281c43e6ad50430c38323bcb876b60efc994a))
18
+ * add evals primitive for LLM-as-judge evaluations ([#101](https://github.com/janeapp/riffer/issues/101)) ([8fd7b36](https://github.com/janeapp/riffer/commit/8fd7b369f2bd0236ea4c7d30cc12e71b960211dd))
19
+ * add guardrails primitive for input/output processing ([#100](https://github.com/janeapp/riffer/issues/100)) ([48d8bad](https://github.com/janeapp/riffer/commit/48d8badce98c0bf9110bafebd3097e25f46c8444))
20
+ * add inline RBS type annotations with Steep type checking ([#103](https://github.com/janeapp/riffer/issues/103)) ([02ae559](https://github.com/janeapp/riffer/commit/02ae559fa580ef4353bd969f2e50e056ab538e2d))
21
+
22
+
23
+ ### Bug Fixes
24
+
25
+ * correct RBS inline annotations and remove ivar declarations ([#109](https://github.com/janeapp/riffer/issues/109)) ([d59076d](https://github.com/janeapp/riffer/commit/d59076d40b88f581b51ddbb9ee3d50ed57e84451))
26
+
27
+
28
+ ### Miscellaneous Chores
29
+
30
+ * set next version ([#111](https://github.com/janeapp/riffer/issues/111)) ([faf41b9](https://github.com/janeapp/riffer/commit/faf41b92032e302c3f0d2d06ab93140137c1b199))
31
+
8
32
  ## [0.11.0](https://github.com/janeapp/riffer/compare/riffer/v0.10.0...riffer/v0.11.0) (2026-02-04)
9
33
 
10
34
 
data/CLAUDE.md ADDED
@@ -0,0 +1 @@
1
+ AGENTS.md
data/Guardfile ADDED
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ guard :shell do
4
+ watch(%r{^lib/.*\.rb$}) do
5
+ system("bundle exec rake rbs:generate")
6
+ end
7
+ end
data/Rakefile CHANGED
@@ -23,4 +23,26 @@ end
23
23
 
24
24
  task docs: :rdoc
25
25
 
26
- task default: %i[test standard]
26
+ namespace :rbs do
27
+ desc "Generate RBS type signatures from inline annotations"
28
+ task :generate do
29
+ sh "bundle exec rbs-inline --output sig/generated lib"
30
+ end
31
+
32
+ desc "Watch lib/ for changes and regenerate RBS files"
33
+ task :watch do
34
+ require "guard"
35
+ require "guard/commander"
36
+
37
+ Guard.start(no_interactions: true)
38
+ end
39
+ end
40
+
41
+ namespace :steep do
42
+ desc "Run Steep type checker"
43
+ task :check do
44
+ sh "bundle exec steep check"
45
+ end
46
+ end
47
+
48
+ task default: %i[test standard steep:check]
data/Steepfile ADDED
@@ -0,0 +1,14 @@
1
+ D = Steep::Diagnostic
2
+
3
+ target :lib do
4
+ signature "sig/generated"
5
+
6
+ check "lib"
7
+
8
+ library "logger"
9
+ library "anthropic"
10
+ library "openai"
11
+ library "aws-sdk-bedrockruntime"
12
+
13
+ configure_code_diagnostics(D::Ruby.lenient)
14
+ end
data/docs/03_AGENTS.md CHANGED
@@ -100,15 +100,38 @@ class MyAgent < Riffer::Agent
100
100
  end
101
101
  ```
102
102
 
103
+ ### guardrail
104
+
105
+ Registers guardrails for pre/post processing of messages. Pass the guardrail class and any options:
106
+
107
+ ```ruby
108
+ class MyAgent < Riffer::Agent
109
+ model 'openai/gpt-4o'
110
+
111
+ # Input-only guardrail
112
+ guardrail :before, with: InputValidator
113
+
114
+ # Output-only guardrail
115
+ guardrail :after, with: ResponseFilter
116
+
117
+ # Both input and output, with options
118
+ guardrail :around, with: Riffer::Guardrails::MaxLength, max: 1000
119
+ end
120
+ ```
121
+
122
+ See [Guardrails](09_GUARDRAILS.md) for detailed documentation.
123
+
103
124
  ## Instance Methods
104
125
 
105
126
  ### generate
106
127
 
107
- Generates a response synchronously:
128
+ Generates a response synchronously. Returns a `Riffer::Agent::Response` object:
108
129
 
109
130
  ```ruby
110
131
  # Class method (recommended for simple calls)
111
132
  response = MyAgent.generate('Hello')
133
+ puts response.content # Access the response text
134
+ puts response.blocked? # Check if guardrail blocked (always false without guardrails)
112
135
 
113
136
  # Instance method (when you need message history or callbacks)
114
137
  agent = MyAgent.new
@@ -109,6 +109,40 @@ event.role # => "assistant"
109
109
  event.content # => "Let me think about this step by step..."
110
110
  ```
111
111
 
112
+ ### GuardrailTripwire
113
+
114
+ Emitted when a guardrail blocks execution during streaming:
115
+
116
+ ```ruby
117
+ agent.stream("Hello").each do |event|
118
+ case event
119
+ when Riffer::StreamEvents::GuardrailTripwire
120
+ puts "Blocked by: #{event.guardrail_id}"
121
+ puts "Reason: #{event.reason}"
122
+ puts "Phase: #{event.phase}" # :before or :after
123
+ end
124
+ end
125
+ ```
126
+
127
+ See [Guardrails](09_GUARDRAILS.md) for more information.
128
+
129
+ ### GuardrailModification
130
+
131
+ Emitted when a guardrail transforms data during streaming:
132
+
133
+ ```ruby
134
+ agent.stream("Hello").each do |event|
135
+ case event
136
+ when Riffer::StreamEvents::GuardrailModification
137
+ puts "Modified by: #{event.guardrail_id}"
138
+ puts "Phase: #{event.phase}" # :before or :after
139
+ puts "Changed: #{event.message_indices}" # Array of affected indices
140
+ end
141
+ end
142
+ ```
143
+
144
+ See [Guardrails](09_GUARDRAILS.md) for more information.
145
+
112
146
  ### TokenUsageDone
113
147
 
114
148
  Emitted when token usage data is available at the end of a response:
data/docs/08_EVALS.md ADDED
@@ -0,0 +1,316 @@
1
+ # Evals
2
+
3
+ Evals let you measure the quality of agent outputs using LLM-as-judge evaluations.
4
+
5
+ ## Overview
6
+
7
+ Riffer Evals provides a framework for evaluating agent responses against configurable quality metrics. It uses an LLM-as-judge approach where a separate model evaluates the outputs of your agents.
8
+
9
+ Key concepts:
10
+
11
+ - **Evaluators** - Classes that evaluate input/output pairs and return scores
12
+ - **Metrics** - Evaluator configurations with pass/fail thresholds
13
+ - **Profiles** - Collections of metrics that can be included in agents
14
+ - **Results** - Individual evaluation scores and aggregate pass/fail status
15
+
16
+ ## Quick Start
17
+
18
+ ```ruby
19
+ # 1. Configure the judge model
20
+ Riffer.config.evals.judge_model = "anthropic/claude-opus-4-5-20251101"
21
+
22
+ # 2. Define an eval profile
23
+ module QualityEvals
24
+ include Riffer::Evals::Profile
25
+
26
+ ai_evals do
27
+ metric :answer_relevancy, min: 0.85
28
+ end
29
+ end
30
+
31
+ # 3. Include in your agent
32
+ class MyAgent < Riffer::Agent
33
+ include QualityEvals
34
+ model "anthropic/claude-haiku-4-5-20251001"
35
+ instructions "You are a helpful assistant."
36
+ end
37
+
38
+ # 4. Run evals
39
+ result = MyAgent.run_eval(input: "What is Ruby?")
40
+ result.passed? # => true/false
41
+ result.aggregate_score # => 0.91
42
+ ```
43
+
44
+ ## Configuration
45
+
46
+ Before using evals, configure the judge model:
47
+
48
+ ```ruby
49
+ Riffer.config.evals.judge_model = "anthropic/claude-opus-4-5-20251101"
50
+ ```
51
+
52
+ The judge model is the LLM that evaluates agent outputs. You can use any configured provider.
53
+
54
+ ## Built-in Evaluators
55
+
56
+ ### answer_relevancy
57
+
58
+ Evaluates how well a response addresses the input question.
59
+
60
+ - **higher_is_better**: true
61
+ - **Score range**: 0.0 to 1.0
62
+ - **1.0**: Perfectly relevant, directly addresses the question
63
+ - **0.7-0.9**: Mostly relevant with minor tangents
64
+ - **0.4-0.6**: Partially relevant, some off-topic content
65
+ - **0.1-0.3**: Mostly irrelevant
66
+ - **0.0**: Completely irrelevant
67
+
68
+ ```ruby
69
+ ai_evals do
70
+ metric :answer_relevancy, min: 0.85
71
+ end
72
+ ```
73
+
74
+ ## Eval Profiles
75
+
76
+ Eval profiles define which evaluators to run and their pass/fail thresholds.
77
+
78
+ ### Defining a Profile
79
+
80
+ ```ruby
81
+ module QualityEvals
82
+ include Riffer::Evals::Profile
83
+
84
+ ai_evals do
85
+ metric :answer_relevancy, min: 0.85
86
+ end
87
+ end
88
+ ```
89
+
90
+ ### Metric Options
91
+
92
+ - `min` - Minimum score to pass (for higher_is_better evaluators)
93
+ - `max` - Maximum score to pass (for lower_is_better evaluators)
94
+ - `weight` - Weight for aggregate scoring (default: 1.0)
95
+
96
+ ```ruby
97
+ ai_evals do
98
+ metric :answer_relevancy, min: 0.85, weight: 2.0 # Weighted more heavily
99
+ end
100
+ ```
101
+
102
+ ### Including in Agents
103
+
104
+ ```ruby
105
+ class MyAgent < Riffer::Agent
106
+ include QualityEvals
107
+ model "anthropic/claude-haiku-4-5-20251001"
108
+ end
109
+ ```
110
+
111
+ ## Running Evals
112
+
113
+ Once a profile is included, call `.run_eval` on the agent class:
114
+
115
+ ```ruby
116
+ result = MyAgent.run_eval(
117
+ input: "What is the capital of France?",
118
+ context: { ground_truth: "Paris" } # Optional context
119
+ )
120
+ ```
121
+
122
+ ### RunResult Object
123
+
124
+ The `run_eval` method returns a `Riffer::Evals::RunResult`:
125
+
126
+ ```ruby
127
+ result.passed? # => true if all metrics pass thresholds
128
+ result.aggregate_score # => Weighted average of normalized scores (0.0-1.0)
129
+ result.failures # => Array of Result objects that failed
130
+ result.results # => Array of all Result objects
131
+ result.input # => The input that was evaluated
132
+ result.output # => The agent's output
133
+ result.to_h # => Hash representation
134
+ ```
135
+
136
+ ### Result Object
137
+
138
+ Individual evaluation results:
139
+
140
+ ```ruby
141
+ result.results.first.evaluator # => "answer_relevancy"
142
+ result.results.first.score # => 0.92
143
+ result.results.first.reason # => "The response directly addresses..."
144
+ result.results.first.higher_is_better # => true
145
+ ```
146
+
147
+ ## Defining Custom Evaluators
148
+
149
+ Create evaluators by subclassing `Riffer::Evals::Evaluator`:
150
+
151
+ ```ruby
152
+ # app/evals/medical_accuracy_evaluator.rb
153
+ class MedicalAccuracyEvaluator < Riffer::Evals::Evaluator
154
+ identifier "medical_accuracy"
155
+ description "Evaluates medical information accuracy"
156
+ higher_is_better true
157
+ judge_model "anthropic/claude-opus-4-5-20251101" # Optional override
158
+
159
+ SYSTEM_PROMPT = <<~PROMPT
160
+ You are an evaluation assistant that assesses medical accuracy.
161
+
162
+ Use the evaluation tool to submit your score (0.0-1.0) and reasoning.
163
+ PROMPT
164
+
165
+ def evaluate(input:, output:, context: nil)
166
+ user_prompt = <<~PROMPT
167
+ Question: #{input}
168
+ Response: #{output}
169
+ Ground truth: #{context&.dig(:ground_truth)}
170
+ PROMPT
171
+
172
+ evaluation = judge.evaluate(
173
+ system_prompt: SYSTEM_PROMPT,
174
+ user_prompt: user_prompt
175
+ )
176
+
177
+ result(score: evaluation[:score], reason: evaluation[:reason])
178
+ end
179
+ end
180
+ ```
181
+
182
+ ### Registering Custom Evaluators
183
+
184
+ Register custom evaluators in your app initialization. Built-in evaluators are always available.
185
+
186
+ ```ruby
187
+ # config/initializers/riffer.rb
188
+ Riffer::Evals::Evaluators::Repository.register(:medical_accuracy, MedicalAccuracyEvaluator)
189
+ ```
190
+
191
+ ### Evaluator DSL
192
+
193
+ Class methods:
194
+
195
+ - `identifier(value)` - Set the evaluator identifier (defaults to snake_case class name)
196
+ - `description(value)` - Human-readable description
197
+ - `higher_is_better(value)` - Whether higher scores are better (default: true)
198
+ - `judge_model(value)` - Override the global judge model
199
+
200
+ Instance methods:
201
+
202
+ - `evaluate(input:, output:, context:)` - Must be implemented, returns a Result
203
+ - `judge` - Returns a Judge instance for LLM-as-judge calls
204
+ - `result(score:, reason:, metadata:)` - Helper to build Result objects
205
+
206
+ ### Judge Options
207
+
208
+ The `judge.evaluate` method accepts either `system_prompt:` and `user_prompt:` or a `messages:` array:
209
+
210
+ ```ruby
211
+ # Using system_prompt and user_prompt
212
+ evaluation = judge.evaluate(
213
+ system_prompt: "You are a judge.",
214
+ user_prompt: "Evaluate this response."
215
+ )
216
+
217
+ # Using messages array (for more control)
218
+ evaluation = judge.evaluate(
219
+ messages: [
220
+ { role: "system", content: "You are a judge." },
221
+ { role: "user", content: "Evaluate this response." }
222
+ ]
223
+ )
224
+ ```
225
+
226
+ The Judge uses tool calling internally to get structured output. An `evaluation` tool with `score` (Float) and `reason` (String) parameters is automatically provided to the judge model, so your prompts should instruct the model to use the evaluation tool rather than respond with raw JSON.
227
+
228
+ ### Rule-Based Evaluators
229
+
230
+ Evaluators don't have to use LLM-as-judge:
231
+
232
+ ```ruby
233
+ class LengthEvaluator < Riffer::Evals::Evaluator
234
+ identifier "response_length"
235
+ description "Checks response is within expected length"
236
+ higher_is_better true
237
+
238
+ def evaluate(input:, output:, context: nil)
239
+ min_length = context&.dig(:min_length) || 50
240
+ max_length = context&.dig(:max_length) || 500
241
+
242
+ length = output.length
243
+
244
+ if length < min_length
245
+ score = length.to_f / min_length
246
+ reason = "Response too short (#{length} < #{min_length})"
247
+ elsif length > max_length
248
+ score = max_length.to_f / length
249
+ reason = "Response too long (#{length} > #{max_length})"
250
+ else
251
+ score = 1.0
252
+ reason = "Response length is appropriate"
253
+ end
254
+
255
+ result(score: score, reason: reason)
256
+ end
257
+ end
258
+ ```
259
+
260
+ ## Aggregate Scoring
261
+
262
+ The `aggregate_score` normalizes all scores so higher is always better:
263
+
264
+ - For `higher_is_better` evaluators: score is used directly
265
+ - For `lower_is_better` evaluators: score is inverted (1.0 - score)
266
+
267
+ Scores are then weighted:
268
+
269
+ ```ruby
270
+ # With weights: relevancy=2.0, toxicity=1.0
271
+ # relevancy score: 0.9 (higher_is_better)
272
+ # toxicity score: 0.1 (lower_is_better)
273
+
274
+ # Normalized: relevancy=0.9, toxicity=0.9 (1.0 - 0.1)
275
+ # Weighted average: (0.9 * 2.0 + 0.9 * 1.0) / (2.0 + 1.0) = 0.9
276
+ ```
277
+
278
+ ## Example: Full Integration
279
+
280
+ ```ruby
281
+ # config/initializers/riffer.rb
282
+ Riffer.configure do |config|
283
+ config.anthropic.api_key = ENV["ANTHROPIC_API_KEY"]
284
+ config.evals.judge_model = "anthropic/claude-opus-4-5-20251101"
285
+ end
286
+
287
+ # app/evals/quality_evals.rb
288
+ module QualityEvals
289
+ include Riffer::Evals::Profile
290
+
291
+ ai_evals do
292
+ metric :answer_relevancy, min: 0.85, weight: 2.0
293
+ end
294
+ end
295
+
296
+ # app/agents/support_agent.rb
297
+ class SupportAgent < Riffer::Agent
298
+ include QualityEvals
299
+
300
+ model "anthropic/claude-opus-4-5-20251101"
301
+ instructions "You are a helpful customer support agent."
302
+ end
303
+
304
+ # test/agents/support_agent_test.rb
305
+ class SupportAgentTest < Minitest::Test
306
+ def test_response_quality
307
+ result = SupportAgent.run_eval(
308
+ input: "How do I reset my password?",
309
+ context: {}
310
+ )
311
+
312
+ assert result.passed?, "Expected eval to pass: #{result.failures.map(&:reason)}"
313
+ assert result.aggregate_score >= 0.85
314
+ end
315
+ end
316
+ ```